From 89077760d4918df5bd1080b6311a456879c6c075 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 16 Nov 2020 18:15:50 +0100 Subject: [PATCH 01/66] Add new test runner parameter `--createami-custom-node-url` The new parameter allows specifying a custom node package URL for the createami test; this is needed when the version has been bumped and the node package is not yet available on PyPI. Signed-off-by: Luca Carrogu --- tests/integration-tests/README.md | 5 ++++- tests/integration-tests/conftest.py | 1 + tests/integration-tests/test_runner.py | 10 ++++++++++ .../tests/createami/test_createami.py | 12 ++++++++++-- 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md index 7cc9b9a70c..65dc0e908b 100644 --- a/tests/integration-tests/README.md +++ b/tests/integration-tests/README.md @@ -39,7 +39,8 @@ python -m test_runner --help usage: test_runner.py [-h] --key-name KEY_NAME --key-path KEY_PATH [-n PARALLELISM] [--sequential] [--credential CREDENTIAL] [--retry-on-failures] [--tests-root-dir TESTS_ROOT_DIR] [-c TESTS_CONFIG] [-i [INSTANCES [INSTANCES ...]]] [-o [OSS [OSS ...]]] [-s [SCHEDULERS [SCHEDULERS ...]]] [-r [REGIONS [REGIONS ...]]] [-f FEATURES [FEATURES ...]] [--show-output] [--reports {html,junitxml,json,cw} [{html,junitxml,json,cw} ...]] [--cw-region CW_REGION] [--cw-namespace CW_NAMESPACE] [--cw-timestamp-day-start] [--output-dir OUTPUT_DIR] - [--custom-node-url CUSTOM_NODE_URL] [--custom-cookbook-url CUSTOM_COOKBOOK_URL] [--createami-custom-cookbook-url CREATEAMI_CUSTOM_COOKBOOK_URL] [--custom-template-url CUSTOM_TEMPLATE_URL] + [--custom-node-url CUSTOM_NODE_URL] [--custom-cookbook-url CUSTOM_COOKBOOK_URL] [--createami-custom-cookbook-url CREATEAMI_CUSTOM_COOKBOOK_URL] + [--createami-custom-node-url CREATEAMI_CUSTOM_NODE_URL] [--custom-template-url CUSTOM_TEMPLATE_URL] [--custom-hit-template-url CUSTOM_HIT_TEMPLATE_URL] [--custom-awsbatchcli-url CUSTOM_AWSBATCHCLI_URL] [--custom-ami CUSTOM_AMI] [--pre-install PRE_INSTALL] [--post-install POST_INSTALL] [--benchmarks] [--benchmarks-target-capacity BENCHMARKS_TARGET_CAPACITY] [--benchmarks-max-time BENCHMARKS_MAX_TIME] [--vpc-stack VPC_STACK] [--cluster CLUSTER] [--no-delete] [--keep-logs-on-cluster-failure] [--keep-logs-on-test-failure] [--stackname-suffix STACKNAME_SUFFIX] [--dry-run] @@ -98,6 +99,8 @@ Custom packages and templates: URL to a custom cookbook package. (default: None) --createami-custom-cookbook-url CREATEAMI_CUSTOM_COOKBOOK_URL URL to a custom cookbook package for the createami command. (default: None) + --createami-custom-node-url CREATEAMI_CUSTOM_NODE_URL + URL to a custom node package for the createami command. (default: None) --custom-template-url CUSTOM_TEMPLATE_URL URL to a custom cfn template.
(default: None) --custom-hit-template-url CUSTOM_HIT_TEMPLATE_URL diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 9c550762bd..5b27aa7b4f 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -70,6 +70,7 @@ def pytest_addoption(parser): parser.addoption( "--createami-custom-chef-cookbook", help="url to a custom cookbook package for the createami command" ) + parser.addoption("--createami-custom-node-package", help="url to a custom node package for the createami command") parser.addoption("--custom-awsbatch-template-url", help="url to a custom awsbatch template") parser.addoption("--template-url", help="url to a custom cfn template") parser.addoption("--hit-template-url", help="url to a custom HIT cfn template") diff --git a/tests/integration-tests/test_runner.py b/tests/integration-tests/test_runner.py index 700476d3cf..b2e8fc7725 100644 --- a/tests/integration-tests/test_runner.py +++ b/tests/integration-tests/test_runner.py @@ -53,6 +53,7 @@ "custom_node_url": None, "custom_cookbook_url": None, "createami_custom_cookbook_url": None, + "createami_custom_node_url": None, "custom_template_url": None, "custom_awsbatchcli_url": None, "custom_hit_template_url": None, @@ -212,6 +213,12 @@ def _init_argparser(): default=TEST_DEFAULTS.get("createami_custom_cookbook_url"), type=_is_url, ) + custom_group.add_argument( + "--createami-custom-node-url", + help="URL to a custom node package for the createami command.", + default=TEST_DEFAULTS.get("createami_custom_node_url"), + type=_is_url, + ) custom_group.add_argument( "--custom-template-url", help="URL to a custom cfn template.", @@ -426,6 +433,9 @@ def _set_custom_packages_args(args, pytest_args): # noqa: C901 if args.createami_custom_cookbook_url: pytest_args.extend(["--createami-custom-chef-cookbook", args.createami_custom_cookbook_url]) + if args.createami_custom_node_url: + pytest_args.extend(["--createami-custom-node-package", args.createami_custom_node_url]) + if args.custom_template_url: pytest_args.extend(["--template-url", args.custom_template_url]) diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index 6d51318139..f94572169e 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and limitations under the License. 
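# The hunks below wire the new option into the createami test by exporting
# PARALLELCLUSTER_NODE_URL to the packer environment. A minimal sketch of the
# intended flow, illustrative only: the option name and environment variable come
# from the patch, while the helper name is an assumption. It is written so that
# `env` is always defined, even when no custom node package is requested.
from os import environ

def _createami_environment(request):
    # Start from the current environment and override the node package URL
    # only when --createami-custom-node-package was passed to pytest.
    env = environ.copy()
    custom_node = request.config.getoption("createami_custom_node_package")
    if custom_node:
        env["PARALLELCLUSTER_NODE_URL"] = custom_node
    return env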
import logging +from os import environ import pytest from assertpy import assert_that @@ -20,7 +21,6 @@ from tests.common.utils import get_installed_parallelcluster_version, retrieve_latest_ami -@pytest.mark.skip(reason="Temporarily disable this test") @pytest.mark.dimensions("eu-west-1", "c5.xlarge", "alinux", "*") @pytest.mark.dimensions("us-west-1", "c5.xlarge", "alinux2", "*") @pytest.mark.dimensions("us-west-2", "c5.xlarge", "centos7", "*") @@ -46,6 +46,13 @@ def test_createami(region, os, instance, request, pcluster_config_reader, vpc_st custom_cookbook = request.config.getoption("createami_custom_chef_cookbook") custom_cookbook_args = [] if not custom_cookbook else ["-cc", custom_cookbook] + # Custom Node + # inject PARALLELCLUSTER_NODE_URL into packer environment + custom_node = request.config.getoption("createami_custom_node_package") + if custom_node: + env = environ.copy() + env["PARALLELCLUSTER_NODE_URL"] = custom_node + # Instance type pcluster_version_result = run_command(["pcluster", "version"]) instance_args = ( @@ -56,7 +63,8 @@ def test_createami(region, os, instance, request, pcluster_config_reader, vpc_st ["pcluster", "createami", "-ai", base_ami, "-os", os, "-r", region, "-c", cluster_config.as_posix()] + custom_cookbook_args + instance_args - + networking_args + + networking_args, + env=env, ) stdout_lower = pcluster_createami_result.stdout.lower() From 7cc99993d1a0a5184c5abfd36d5f95d35954a597 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 19 Nov 2020 13:17:32 +0100 Subject: [PATCH 02/66] Remove old CentOS6 references Signed-off-by: Enrico Usai --- tests/integration-tests/README.md | 1 - tests/integration-tests/tests/common/utils.py | 2 +- tests/integration-tests/tests/storage/test_fsx_lustre.py | 2 -- tests/integration-tests/tests/tags/test_tag_propagation.py | 2 +- 4 files changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md index 65dc0e908b..9e4b9fff85 100644 --- a/tests/integration-tests/README.md +++ b/tests/integration-tests/README.md @@ -229,7 +229,6 @@ cloudwatch_logging suite defined above will produce the following parametrizatio ``` cloudwatch_logging/test_cloudwatch_logging.py::test_cloudwatch_logging[ap-east-1-c5.xlarge-alinux-slurm] cloudwatch_logging/test_cloudwatch_logging.py::test_cloudwatch_logging[ap-east-1-c5.xlarge-alinux2-slurm] -cloudwatch_logging/test_cloudwatch_logging.py::test_cloudwatch_logging[ap-east-1-c5.xlarge-centos6-slurm] cloudwatch_logging/test_cloudwatch_logging.py::test_cloudwatch_logging[ap-east-1-c5.xlarge-centos7-slurm] cloudwatch_logging/test_cloudwatch_logging.py::test_cloudwatch_logging[ap-east-1-c5.xlarge-ubuntu1604-slurm] cloudwatch_logging/test_cloudwatch_logging.py::test_cloudwatch_logging[ap-east-1-c5.xlarge-ubuntu1804-slurm] diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index 3eccc4a09d..d9ee887512 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -52,8 +52,8 @@ OS_TO_PCLUSTER_AMI_NAME_OWNER_MAP = { "alinux": {"name": "amzn-hvm-x86_64-*", "owners": ["amazon"]}, "alinux2": {"name": "amzn2-hvm-*-*", "owners": ["amazon"]}, - "centos6": {"name": "centos6-hvm-x86_64-*", "owners": ["amazon"]}, "centos7": {"name": "centos7-hvm-x86_64-*", "owners": ["amazon"]}, + "centos8": {"name": "centos8-hvm-x86_64-*", "owners": ["amazon"]}, "ubuntu1604": {"name": "ubuntu-1604-lts-hvm-x86_64-*", "owners": ["amazon"]}, "ubuntu1804": {"name": 
"ubuntu-1804-lts-hvm-*-*", "owners": ["amazon"]}, } diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre.py b/tests/integration-tests/tests/storage/test_fsx_lustre.py index bec49993af..fa81c5dd1f 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre.py +++ b/tests/integration-tests/tests/storage/test_fsx_lustre.py @@ -54,8 +54,6 @@ @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["slurm"]) @pytest.mark.usefixtures("instance") -# FSx is not supported on CentOS 6 -@pytest.mark.skip_oss(["centos6"]) def test_fsx_lustre_configuration_options( deployment_type, per_unit_storage_throughput, diff --git a/tests/integration-tests/tests/tags/test_tag_propagation.py b/tests/integration-tests/tests/tags/test_tag_propagation.py index 5540410ff2..6b5e75c04c 100644 --- a/tests/integration-tests/tests/tags/test_tag_propagation.py +++ b/tests/integration-tests/tests/tags/test_tag_propagation.py @@ -190,8 +190,8 @@ def get_root_volume_id(instance_id, region, os): logging.info("Getting root volume for instance %s", instance_id) os_to_root_volume_device = { # These are taken from the main CFN template - "centos6": "/dev/sda1", "centos7": "/dev/sda1", + "centos8": "/dev/sda1", "alinux": "/dev/xvda", "alinux2": "/dev/xvda", "ubuntu1604": "/dev/sda1", From 949e5ce35db249c11892856df4bf149317b53a62 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Thu, 19 Nov 2020 10:23:14 +0100 Subject: [PATCH 03/66] Refactor package structure and improve tox testenv commands * Refactor src and tests structure according to https://docs.pytest.org/en/stable/goodpractices.html * Differentiated between tests with coverage and tests without coverage. Run without coverage with all supported Python version and test against the installed version of the CLI (installing from the sdist package). Run with coverage only for Python 3.8 - when running with coverage tests are executed against the package installed in development mode. 
* Added Python 3.9 to tests * Grouped all travis tasks in a single stage so that the run is faster * Updated setup.py file to reflect the new structure and to add missing project information Signed-off-by: Francesco De Martino --- .travis.yml | 35 ++++++++++++------ cli/.flake8 | 10 ++--- cli/MANIFEST.in | 5 +-- cli/setup.py | 21 +++++++++-- cli/{ => src}/awsbatch/__init__.py | 0 cli/{ => src}/awsbatch/awsbhosts.py | 0 cli/{ => src}/awsbatch/awsbkill.py | 0 cli/{ => src}/awsbatch/awsbout.py | 0 cli/{ => src}/awsbatch/awsbqueues.py | 0 cli/{ => src}/awsbatch/awsbstat.py | 0 cli/{ => src}/awsbatch/awsbsub.py | 0 cli/{ => src}/awsbatch/common.py | 0 .../awsbatch/examples/awsbatch-cli.cfg | 0 cli/{ => src}/awsbatch/utils.py | 0 cli/{ => src}/pcluster/__init__.py | 0 cli/{ => src}/pcluster/cli.py | 0 .../pcluster/cli_commands/__init__.py | 0 .../compute_fleet_status_manager.py | 0 cli/{ => src}/pcluster/cli_commands/delete.py | 0 cli/{ => src}/pcluster/cli_commands/start.py | 0 cli/{ => src}/pcluster/cli_commands/stop.py | 0 cli/{ => src}/pcluster/cli_commands/update.py | 0 cli/{ => src}/pcluster/cluster_model.py | 0 cli/{ => src}/pcluster/commands.py | 0 cli/{ => src}/pcluster/config/__init__.py | 0 .../pcluster/config/cfn_param_types.py | 0 cli/{ => src}/pcluster/config/config_patch.py | 0 .../pcluster/config/hit_converter.py | 0 .../pcluster/config/iam_policy_rules.py | 0 .../pcluster/config/json_param_types.py | 0 cli/{ => src}/pcluster/config/mappings.py | 0 cli/{ => src}/pcluster/config/param_types.py | 0 .../pcluster/config/pcluster_config.py | 0 cli/{ => src}/pcluster/config/resource_map.py | 0 .../pcluster/config/update_policy.py | 0 cli/{ => src}/pcluster/config/validators.py | 0 cli/{ => src}/pcluster/configure/__init__.py | 0 .../pcluster/configure/easyconfig.py | 8 ++-- .../pcluster/configure/networking.py | 0 .../pcluster/configure/subnet_computation.py | 0 cli/{ => src}/pcluster/configure/utils.py | 0 cli/{ => src}/pcluster/constants.py | 0 cli/{ => src}/pcluster/createami.py | 0 cli/{ => src}/pcluster/dcv/__init__.py | 0 cli/{ => src}/pcluster/dcv/connect.py | 0 cli/{ => src}/pcluster/dcv/utils.py | 0 cli/{ => src}/pcluster/examples/config | 0 cli/{ => src}/pcluster/examples/job.sh | 0 cli/{ => src}/pcluster/models/__init__.py | 0 cli/{ => src}/pcluster/models/hit/__init__.py | 0 .../pcluster/models/hit/hit_cluster_model.py | 0 cli/{ => src}/pcluster/models/sit/__init__.py | 0 .../pcluster/models/sit/sit_cluster_model.py | 0 cli/{ => src}/pcluster/networking/__init__.py | 0 .../pcluster/networking/vpc_factory.py | 0 .../pcluster/resources/batch/docker/alinux | 0 .../resources/batch/docker/alinux2/Dockerfile | 0 .../batch/docker/build-docker-images.sh | 0 .../resources/batch/docker/buildspec.yml | 0 .../batch/docker/scripts/entrypoint.sh | 0 .../batch/docker/scripts/generate_hostfile.sh | 0 .../batch/docker/scripts/mount_efs.sh | 0 .../batch/docker/scripts/mount_nfs.sh | 0 .../batch/docker/upload-docker-images.sh | 0 .../custom_resources_code/__init__.py | 0 .../cleanup_resources.py | 0 .../custom_resources_code/crhelper/LICENSE | 0 .../custom_resources_code/crhelper/NOTICE | 0 .../crhelper/__init__.py | 0 .../crhelper/log_helper.py | 0 .../crhelper/resource_helper.py | 0 .../custom_resources_code/crhelper/utils.py | 0 .../manage_docker_images.py | 0 .../send_build_notification.py | 0 .../custom_resources_code/wait_for_update.py | 0 cli/{ => src}/pcluster/utils.py | 0 cli/{ => src}/pcluster_config/__init__.py | 0 cli/{ => src}/pcluster_config/cli.py | 7 ++-- cli/tests/conftest.py 
| 4 +- cli/tests/pcluster/config/utils.py | 2 +- cli/tests/pcluster/configure/__init__.py | 10 +++++ cli/tests/pcluster/createami/__init__.py | 10 +++++ cli/tox.ini | 37 +++++++++---------- 83 files changed, 94 insertions(+), 55 deletions(-) rename cli/{ => src}/awsbatch/__init__.py (100%) rename cli/{ => src}/awsbatch/awsbhosts.py (100%) rename cli/{ => src}/awsbatch/awsbkill.py (100%) rename cli/{ => src}/awsbatch/awsbout.py (100%) rename cli/{ => src}/awsbatch/awsbqueues.py (100%) rename cli/{ => src}/awsbatch/awsbstat.py (100%) rename cli/{ => src}/awsbatch/awsbsub.py (100%) rename cli/{ => src}/awsbatch/common.py (100%) rename cli/{ => src}/awsbatch/examples/awsbatch-cli.cfg (100%) rename cli/{ => src}/awsbatch/utils.py (100%) rename cli/{ => src}/pcluster/__init__.py (100%) rename cli/{ => src}/pcluster/cli.py (100%) rename cli/{ => src}/pcluster/cli_commands/__init__.py (100%) rename cli/{ => src}/pcluster/cli_commands/compute_fleet_status_manager.py (100%) rename cli/{ => src}/pcluster/cli_commands/delete.py (100%) rename cli/{ => src}/pcluster/cli_commands/start.py (100%) rename cli/{ => src}/pcluster/cli_commands/stop.py (100%) rename cli/{ => src}/pcluster/cli_commands/update.py (100%) rename cli/{ => src}/pcluster/cluster_model.py (100%) rename cli/{ => src}/pcluster/commands.py (100%) rename cli/{ => src}/pcluster/config/__init__.py (100%) rename cli/{ => src}/pcluster/config/cfn_param_types.py (100%) rename cli/{ => src}/pcluster/config/config_patch.py (100%) rename cli/{ => src}/pcluster/config/hit_converter.py (100%) rename cli/{ => src}/pcluster/config/iam_policy_rules.py (100%) rename cli/{ => src}/pcluster/config/json_param_types.py (100%) rename cli/{ => src}/pcluster/config/mappings.py (100%) rename cli/{ => src}/pcluster/config/param_types.py (100%) rename cli/{ => src}/pcluster/config/pcluster_config.py (100%) rename cli/{ => src}/pcluster/config/resource_map.py (100%) rename cli/{ => src}/pcluster/config/update_policy.py (100%) rename cli/{ => src}/pcluster/config/validators.py (100%) rename cli/{ => src}/pcluster/configure/__init__.py (100%) rename cli/{ => src}/pcluster/configure/easyconfig.py (99%) rename cli/{ => src}/pcluster/configure/networking.py (100%) rename cli/{ => src}/pcluster/configure/subnet_computation.py (100%) rename cli/{ => src}/pcluster/configure/utils.py (100%) rename cli/{ => src}/pcluster/constants.py (100%) rename cli/{ => src}/pcluster/createami.py (100%) rename cli/{ => src}/pcluster/dcv/__init__.py (100%) rename cli/{ => src}/pcluster/dcv/connect.py (100%) rename cli/{ => src}/pcluster/dcv/utils.py (100%) rename cli/{ => src}/pcluster/examples/config (100%) rename cli/{ => src}/pcluster/examples/job.sh (100%) rename cli/{ => src}/pcluster/models/__init__.py (100%) rename cli/{ => src}/pcluster/models/hit/__init__.py (100%) rename cli/{ => src}/pcluster/models/hit/hit_cluster_model.py (100%) rename cli/{ => src}/pcluster/models/sit/__init__.py (100%) rename cli/{ => src}/pcluster/models/sit/sit_cluster_model.py (100%) rename cli/{ => src}/pcluster/networking/__init__.py (100%) rename cli/{ => src}/pcluster/networking/vpc_factory.py (100%) rename cli/{ => src}/pcluster/resources/batch/docker/alinux (100%) rename cli/{ => src}/pcluster/resources/batch/docker/alinux2/Dockerfile (100%) rename cli/{ => src}/pcluster/resources/batch/docker/build-docker-images.sh (100%) rename cli/{ => src}/pcluster/resources/batch/docker/buildspec.yml (100%) rename cli/{ => src}/pcluster/resources/batch/docker/scripts/entrypoint.sh (100%) rename cli/{ => 
src}/pcluster/resources/batch/docker/scripts/generate_hostfile.sh (100%) rename cli/{ => src}/pcluster/resources/batch/docker/scripts/mount_efs.sh (100%) rename cli/{ => src}/pcluster/resources/batch/docker/scripts/mount_nfs.sh (100%) rename cli/{ => src}/pcluster/resources/batch/docker/upload-docker-images.sh (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/__init__.py (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/cleanup_resources.py (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/crhelper/LICENSE (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/crhelper/NOTICE (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/crhelper/__init__.py (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/crhelper/log_helper.py (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/crhelper/resource_helper.py (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/crhelper/utils.py (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/manage_docker_images.py (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/send_build_notification.py (100%) rename cli/{ => src}/pcluster/resources/custom_resources/custom_resources_code/wait_for_update.py (100%) rename cli/{ => src}/pcluster/utils.py (100%) rename cli/{ => src}/pcluster_config/__init__.py (100%) rename cli/{ => src}/pcluster_config/cli.py (97%) create mode 100644 cli/tests/pcluster/configure/__init__.py create mode 100644 cli/tests/pcluster/createami/__init__.py diff --git a/.travis.yml b/.travis.yml index 61643625e6..b9130aab4a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,27 +2,40 @@ language: python sudo: required dist: xenial -python: - - "2.7" - - "3.4" - - "3.5" - - "3.6" - - "3.7" - - "3.8" - matrix: include: + - name: Python 2.7 Tests + python: 2.7 + env: TOXENV=py27-nocov + - name: Python 3.4 Tests + python: 3.4 + env: TOXENV=py34-nocov + - name: Python 3.5 Tests + python: 3.5 + env: TOXENV=py35-nocov + - name: Python 3.6 Tests + python: 3.6 + env: TOXENV=py36-nocov + - name: Python 3.7 Tests + python: 3.7 + env: TOXENV=py37-nocov + - name: Python 3.8 Tests + python: 3.8 + env: TOXENV=py38-nocov + - name: Python 3.9 Tests + python: 3.9 + env: TOXENV=py39-nocov + - name: Python 3.8 Tests Coverage + python: 3.8 + env: TOXENV=py38-cov - name: Code Checks python: 3.6 - stage: linters env: TOXENV=code-linters - name: CloudFormation Templates Checks python: 3.6 - stage: linters env: TOXENV=cfn-format-check,cfn-lint,cfn-tests - name: Validate integration tests configs python: 3.6 - stage: linters env: TOXENV=validate-test-configs script: cd tests/integration-tests && tox diff --git a/cli/.flake8 b/cli/.flake8 index 052dc1fc72..fef48e04b6 100644 --- a/cli/.flake8 +++ b/cli/.flake8 @@ -7,7 +7,6 @@ ignore = D103, # Missing docstring in public function W503, # line break before binary operator => Conflicts with black style. 
D413, # Missing blank line after last section -# D103 Missing docstring in public function # E402 module level import not at top of file # D101 Missing docstring in public class # D102 Missing docstring in public method @@ -15,12 +14,9 @@ ignore = # D400 First line should end with a period # D401 First line should be in imperative mood per-file-ignores = - pcluster/configure/easyconfig.py: E402 - pcluster/utils.py: E402 - tests/pcluster/pcluster-unittest.py: D101, D102 - tests/pcluster/configure/test_*.py: D101, D102 - tests/pcluster/*/test_*.py: D101, D102 - tests/awsbatch/test_*.py: D101, D102 + src/pcluster/configure/easyconfig.py: E402 + src/pcluster/utils.py: E402 + tests/*: D101, D102 ../tests/integration-tests/*: D205, D400, D401 exclude = .tox, diff --git a/cli/MANIFEST.in b/cli/MANIFEST.in index aa190a4faa..977b1cd061 100644 --- a/cli/MANIFEST.in +++ b/cli/MANIFEST.in @@ -1,4 +1,3 @@ -recursive-include pcluster/cloudformation * -recursive-include pcluster/examples * -recursive-include pcluster/resources * +recursive-include src/pcluster/examples * +recursive-include src/pcluster/resources * recursive-exclude tests * diff --git a/cli/setup.py b/cli/setup.py index c330554e95..d5b3a752cf 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -46,8 +46,9 @@ def readme(): "and manage HPC clusters in the AWS cloud.", url="https://github.com/aws/aws-parallelcluster", license="Apache License 2.0", - packages=find_packages(), - python_requires=">=2.7", + package_dir={"": "src"}, + packages=find_packages("src"), + python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*", install_requires=REQUIRES, entry_points={ "console_scripts": [ @@ -63,13 +64,27 @@ def readme(): }, include_package_data=True, zip_safe=False, - package_data={"": ["examples/config"]}, + package_data={"": ["src/examples/config"]}, long_description=readme(), classifiers=[ "Development Status :: 5 - Production/Stable", "Environment :: Console", "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Topic :: Scientific/Engineering", "License :: OSI Approved :: Apache Software License", ], + project_urls={ + "Changelog": "https://github.com/aws/aws-parallelcluster/blob/develop/CHANGELOG.md", + "Issue Tracker": "https://github.com/aws/aws-parallelcluster/issues", + "Documentation": "https://docs.aws.amazon.com/parallelcluster/", + }, ) diff --git a/cli/awsbatch/__init__.py b/cli/src/awsbatch/__init__.py similarity index 100% rename from cli/awsbatch/__init__.py rename to cli/src/awsbatch/__init__.py diff --git a/cli/awsbatch/awsbhosts.py b/cli/src/awsbatch/awsbhosts.py similarity index 100% rename from cli/awsbatch/awsbhosts.py rename to cli/src/awsbatch/awsbhosts.py diff --git a/cli/awsbatch/awsbkill.py b/cli/src/awsbatch/awsbkill.py similarity index 100% rename from cli/awsbatch/awsbkill.py rename to cli/src/awsbatch/awsbkill.py diff --git a/cli/awsbatch/awsbout.py b/cli/src/awsbatch/awsbout.py similarity index 100% rename from cli/awsbatch/awsbout.py rename to cli/src/awsbatch/awsbout.py diff --git a/cli/awsbatch/awsbqueues.py b/cli/src/awsbatch/awsbqueues.py similarity index 100% rename from cli/awsbatch/awsbqueues.py rename to cli/src/awsbatch/awsbqueues.py diff --git 
a/cli/awsbatch/awsbstat.py b/cli/src/awsbatch/awsbstat.py similarity index 100% rename from cli/awsbatch/awsbstat.py rename to cli/src/awsbatch/awsbstat.py diff --git a/cli/awsbatch/awsbsub.py b/cli/src/awsbatch/awsbsub.py similarity index 100% rename from cli/awsbatch/awsbsub.py rename to cli/src/awsbatch/awsbsub.py diff --git a/cli/awsbatch/common.py b/cli/src/awsbatch/common.py similarity index 100% rename from cli/awsbatch/common.py rename to cli/src/awsbatch/common.py diff --git a/cli/awsbatch/examples/awsbatch-cli.cfg b/cli/src/awsbatch/examples/awsbatch-cli.cfg similarity index 100% rename from cli/awsbatch/examples/awsbatch-cli.cfg rename to cli/src/awsbatch/examples/awsbatch-cli.cfg diff --git a/cli/awsbatch/utils.py b/cli/src/awsbatch/utils.py similarity index 100% rename from cli/awsbatch/utils.py rename to cli/src/awsbatch/utils.py diff --git a/cli/pcluster/__init__.py b/cli/src/pcluster/__init__.py similarity index 100% rename from cli/pcluster/__init__.py rename to cli/src/pcluster/__init__.py diff --git a/cli/pcluster/cli.py b/cli/src/pcluster/cli.py similarity index 100% rename from cli/pcluster/cli.py rename to cli/src/pcluster/cli.py diff --git a/cli/pcluster/cli_commands/__init__.py b/cli/src/pcluster/cli_commands/__init__.py similarity index 100% rename from cli/pcluster/cli_commands/__init__.py rename to cli/src/pcluster/cli_commands/__init__.py diff --git a/cli/pcluster/cli_commands/compute_fleet_status_manager.py b/cli/src/pcluster/cli_commands/compute_fleet_status_manager.py similarity index 100% rename from cli/pcluster/cli_commands/compute_fleet_status_manager.py rename to cli/src/pcluster/cli_commands/compute_fleet_status_manager.py diff --git a/cli/pcluster/cli_commands/delete.py b/cli/src/pcluster/cli_commands/delete.py similarity index 100% rename from cli/pcluster/cli_commands/delete.py rename to cli/src/pcluster/cli_commands/delete.py diff --git a/cli/pcluster/cli_commands/start.py b/cli/src/pcluster/cli_commands/start.py similarity index 100% rename from cli/pcluster/cli_commands/start.py rename to cli/src/pcluster/cli_commands/start.py diff --git a/cli/pcluster/cli_commands/stop.py b/cli/src/pcluster/cli_commands/stop.py similarity index 100% rename from cli/pcluster/cli_commands/stop.py rename to cli/src/pcluster/cli_commands/stop.py diff --git a/cli/pcluster/cli_commands/update.py b/cli/src/pcluster/cli_commands/update.py similarity index 100% rename from cli/pcluster/cli_commands/update.py rename to cli/src/pcluster/cli_commands/update.py diff --git a/cli/pcluster/cluster_model.py b/cli/src/pcluster/cluster_model.py similarity index 100% rename from cli/pcluster/cluster_model.py rename to cli/src/pcluster/cluster_model.py diff --git a/cli/pcluster/commands.py b/cli/src/pcluster/commands.py similarity index 100% rename from cli/pcluster/commands.py rename to cli/src/pcluster/commands.py diff --git a/cli/pcluster/config/__init__.py b/cli/src/pcluster/config/__init__.py similarity index 100% rename from cli/pcluster/config/__init__.py rename to cli/src/pcluster/config/__init__.py diff --git a/cli/pcluster/config/cfn_param_types.py b/cli/src/pcluster/config/cfn_param_types.py similarity index 100% rename from cli/pcluster/config/cfn_param_types.py rename to cli/src/pcluster/config/cfn_param_types.py diff --git a/cli/pcluster/config/config_patch.py b/cli/src/pcluster/config/config_patch.py similarity index 100% rename from cli/pcluster/config/config_patch.py rename to cli/src/pcluster/config/config_patch.py diff --git a/cli/pcluster/config/hit_converter.py 
b/cli/src/pcluster/config/hit_converter.py similarity index 100% rename from cli/pcluster/config/hit_converter.py rename to cli/src/pcluster/config/hit_converter.py diff --git a/cli/pcluster/config/iam_policy_rules.py b/cli/src/pcluster/config/iam_policy_rules.py similarity index 100% rename from cli/pcluster/config/iam_policy_rules.py rename to cli/src/pcluster/config/iam_policy_rules.py diff --git a/cli/pcluster/config/json_param_types.py b/cli/src/pcluster/config/json_param_types.py similarity index 100% rename from cli/pcluster/config/json_param_types.py rename to cli/src/pcluster/config/json_param_types.py diff --git a/cli/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py similarity index 100% rename from cli/pcluster/config/mappings.py rename to cli/src/pcluster/config/mappings.py diff --git a/cli/pcluster/config/param_types.py b/cli/src/pcluster/config/param_types.py similarity index 100% rename from cli/pcluster/config/param_types.py rename to cli/src/pcluster/config/param_types.py diff --git a/cli/pcluster/config/pcluster_config.py b/cli/src/pcluster/config/pcluster_config.py similarity index 100% rename from cli/pcluster/config/pcluster_config.py rename to cli/src/pcluster/config/pcluster_config.py diff --git a/cli/pcluster/config/resource_map.py b/cli/src/pcluster/config/resource_map.py similarity index 100% rename from cli/pcluster/config/resource_map.py rename to cli/src/pcluster/config/resource_map.py diff --git a/cli/pcluster/config/update_policy.py b/cli/src/pcluster/config/update_policy.py similarity index 100% rename from cli/pcluster/config/update_policy.py rename to cli/src/pcluster/config/update_policy.py diff --git a/cli/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py similarity index 100% rename from cli/pcluster/config/validators.py rename to cli/src/pcluster/config/validators.py diff --git a/cli/pcluster/configure/__init__.py b/cli/src/pcluster/configure/__init__.py similarity index 100% rename from cli/pcluster/configure/__init__.py rename to cli/src/pcluster/configure/__init__.py diff --git a/cli/pcluster/configure/easyconfig.py b/cli/src/pcluster/configure/easyconfig.py similarity index 99% rename from cli/pcluster/configure/easyconfig.py rename to cli/src/pcluster/configure/easyconfig.py index e1318b0d00..362d548036 100644 --- a/cli/pcluster/configure/easyconfig.py +++ b/cli/src/pcluster/configure/easyconfig.py @@ -11,11 +11,6 @@ # fmt: off from __future__ import absolute_import, print_function # isort:skip from future import standard_library # isort:skip - -from pcluster.cluster_model import ClusterModel -from pcluster.config.hit_converter import HitConverter -from pcluster.config.validators import HEAD_NODE_UNSUPPORTED_INSTANCE_TYPES, HEAD_NODE_UNSUPPORTED_MESSAGE - standard_library.install_aliases() # fmt: on @@ -26,7 +21,10 @@ import boto3 +from pcluster.cluster_model import ClusterModel +from pcluster.config.hit_converter import HitConverter from pcluster.config.pcluster_config import PclusterConfig +from pcluster.config.validators import HEAD_NODE_UNSUPPORTED_INSTANCE_TYPES, HEAD_NODE_UNSUPPORTED_MESSAGE from pcluster.configure.networking import ( NetworkConfiguration, PublicPrivateNetworkConfig, diff --git a/cli/pcluster/configure/networking.py b/cli/src/pcluster/configure/networking.py similarity index 100% rename from cli/pcluster/configure/networking.py rename to cli/src/pcluster/configure/networking.py diff --git a/cli/pcluster/configure/subnet_computation.py b/cli/src/pcluster/configure/subnet_computation.py 
similarity index 100% rename from cli/pcluster/configure/subnet_computation.py rename to cli/src/pcluster/configure/subnet_computation.py diff --git a/cli/pcluster/configure/utils.py b/cli/src/pcluster/configure/utils.py similarity index 100% rename from cli/pcluster/configure/utils.py rename to cli/src/pcluster/configure/utils.py diff --git a/cli/pcluster/constants.py b/cli/src/pcluster/constants.py similarity index 100% rename from cli/pcluster/constants.py rename to cli/src/pcluster/constants.py diff --git a/cli/pcluster/createami.py b/cli/src/pcluster/createami.py similarity index 100% rename from cli/pcluster/createami.py rename to cli/src/pcluster/createami.py diff --git a/cli/pcluster/dcv/__init__.py b/cli/src/pcluster/dcv/__init__.py similarity index 100% rename from cli/pcluster/dcv/__init__.py rename to cli/src/pcluster/dcv/__init__.py diff --git a/cli/pcluster/dcv/connect.py b/cli/src/pcluster/dcv/connect.py similarity index 100% rename from cli/pcluster/dcv/connect.py rename to cli/src/pcluster/dcv/connect.py diff --git a/cli/pcluster/dcv/utils.py b/cli/src/pcluster/dcv/utils.py similarity index 100% rename from cli/pcluster/dcv/utils.py rename to cli/src/pcluster/dcv/utils.py diff --git a/cli/pcluster/examples/config b/cli/src/pcluster/examples/config similarity index 100% rename from cli/pcluster/examples/config rename to cli/src/pcluster/examples/config diff --git a/cli/pcluster/examples/job.sh b/cli/src/pcluster/examples/job.sh similarity index 100% rename from cli/pcluster/examples/job.sh rename to cli/src/pcluster/examples/job.sh diff --git a/cli/pcluster/models/__init__.py b/cli/src/pcluster/models/__init__.py similarity index 100% rename from cli/pcluster/models/__init__.py rename to cli/src/pcluster/models/__init__.py diff --git a/cli/pcluster/models/hit/__init__.py b/cli/src/pcluster/models/hit/__init__.py similarity index 100% rename from cli/pcluster/models/hit/__init__.py rename to cli/src/pcluster/models/hit/__init__.py diff --git a/cli/pcluster/models/hit/hit_cluster_model.py b/cli/src/pcluster/models/hit/hit_cluster_model.py similarity index 100% rename from cli/pcluster/models/hit/hit_cluster_model.py rename to cli/src/pcluster/models/hit/hit_cluster_model.py diff --git a/cli/pcluster/models/sit/__init__.py b/cli/src/pcluster/models/sit/__init__.py similarity index 100% rename from cli/pcluster/models/sit/__init__.py rename to cli/src/pcluster/models/sit/__init__.py diff --git a/cli/pcluster/models/sit/sit_cluster_model.py b/cli/src/pcluster/models/sit/sit_cluster_model.py similarity index 100% rename from cli/pcluster/models/sit/sit_cluster_model.py rename to cli/src/pcluster/models/sit/sit_cluster_model.py diff --git a/cli/pcluster/networking/__init__.py b/cli/src/pcluster/networking/__init__.py similarity index 100% rename from cli/pcluster/networking/__init__.py rename to cli/src/pcluster/networking/__init__.py diff --git a/cli/pcluster/networking/vpc_factory.py b/cli/src/pcluster/networking/vpc_factory.py similarity index 100% rename from cli/pcluster/networking/vpc_factory.py rename to cli/src/pcluster/networking/vpc_factory.py diff --git a/cli/pcluster/resources/batch/docker/alinux b/cli/src/pcluster/resources/batch/docker/alinux similarity index 100% rename from cli/pcluster/resources/batch/docker/alinux rename to cli/src/pcluster/resources/batch/docker/alinux diff --git a/cli/pcluster/resources/batch/docker/alinux2/Dockerfile b/cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile similarity index 100% rename from 
cli/pcluster/resources/batch/docker/alinux2/Dockerfile rename to cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile diff --git a/cli/pcluster/resources/batch/docker/build-docker-images.sh b/cli/src/pcluster/resources/batch/docker/build-docker-images.sh similarity index 100% rename from cli/pcluster/resources/batch/docker/build-docker-images.sh rename to cli/src/pcluster/resources/batch/docker/build-docker-images.sh diff --git a/cli/pcluster/resources/batch/docker/buildspec.yml b/cli/src/pcluster/resources/batch/docker/buildspec.yml similarity index 100% rename from cli/pcluster/resources/batch/docker/buildspec.yml rename to cli/src/pcluster/resources/batch/docker/buildspec.yml diff --git a/cli/pcluster/resources/batch/docker/scripts/entrypoint.sh b/cli/src/pcluster/resources/batch/docker/scripts/entrypoint.sh similarity index 100% rename from cli/pcluster/resources/batch/docker/scripts/entrypoint.sh rename to cli/src/pcluster/resources/batch/docker/scripts/entrypoint.sh diff --git a/cli/pcluster/resources/batch/docker/scripts/generate_hostfile.sh b/cli/src/pcluster/resources/batch/docker/scripts/generate_hostfile.sh similarity index 100% rename from cli/pcluster/resources/batch/docker/scripts/generate_hostfile.sh rename to cli/src/pcluster/resources/batch/docker/scripts/generate_hostfile.sh diff --git a/cli/pcluster/resources/batch/docker/scripts/mount_efs.sh b/cli/src/pcluster/resources/batch/docker/scripts/mount_efs.sh similarity index 100% rename from cli/pcluster/resources/batch/docker/scripts/mount_efs.sh rename to cli/src/pcluster/resources/batch/docker/scripts/mount_efs.sh diff --git a/cli/pcluster/resources/batch/docker/scripts/mount_nfs.sh b/cli/src/pcluster/resources/batch/docker/scripts/mount_nfs.sh similarity index 100% rename from cli/pcluster/resources/batch/docker/scripts/mount_nfs.sh rename to cli/src/pcluster/resources/batch/docker/scripts/mount_nfs.sh diff --git a/cli/pcluster/resources/batch/docker/upload-docker-images.sh b/cli/src/pcluster/resources/batch/docker/upload-docker-images.sh similarity index 100% rename from cli/pcluster/resources/batch/docker/upload-docker-images.sh rename to cli/src/pcluster/resources/batch/docker/upload-docker-images.sh diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/__init__.py b/cli/src/pcluster/resources/custom_resources/custom_resources_code/__init__.py similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/__init__.py rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/__init__.py diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/cleanup_resources.py b/cli/src/pcluster/resources/custom_resources/custom_resources_code/cleanup_resources.py similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/cleanup_resources.py rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/cleanup_resources.py diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/LICENSE b/cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/LICENSE similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/LICENSE rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/LICENSE diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/NOTICE b/cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/NOTICE similarity index 
100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/NOTICE rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/NOTICE diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/__init__.py b/cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/__init__.py similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/__init__.py rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/__init__.py diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/log_helper.py b/cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/log_helper.py similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/log_helper.py rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/log_helper.py diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/resource_helper.py b/cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/resource_helper.py similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/resource_helper.py rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/resource_helper.py diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/utils.py b/cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/utils.py similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/crhelper/utils.py rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/crhelper/utils.py diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/manage_docker_images.py b/cli/src/pcluster/resources/custom_resources/custom_resources_code/manage_docker_images.py similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/manage_docker_images.py rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/manage_docker_images.py diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/send_build_notification.py b/cli/src/pcluster/resources/custom_resources/custom_resources_code/send_build_notification.py similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/send_build_notification.py rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/send_build_notification.py diff --git a/cli/pcluster/resources/custom_resources/custom_resources_code/wait_for_update.py b/cli/src/pcluster/resources/custom_resources/custom_resources_code/wait_for_update.py similarity index 100% rename from cli/pcluster/resources/custom_resources/custom_resources_code/wait_for_update.py rename to cli/src/pcluster/resources/custom_resources/custom_resources_code/wait_for_update.py diff --git a/cli/pcluster/utils.py b/cli/src/pcluster/utils.py similarity index 100% rename from cli/pcluster/utils.py rename to cli/src/pcluster/utils.py diff --git a/cli/pcluster_config/__init__.py b/cli/src/pcluster_config/__init__.py similarity index 100% rename from cli/pcluster_config/__init__.py rename to cli/src/pcluster_config/__init__.py diff --git a/cli/pcluster_config/cli.py b/cli/src/pcluster_config/cli.py similarity index 97% rename from cli/pcluster_config/cli.py rename to 
cli/src/pcluster_config/cli.py index 6befd5e2ac..bc5f1a4cd0 100644 --- a/cli/pcluster_config/cli.py +++ b/cli/src/pcluster_config/cli.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and limitations under the License. # -import argparse import os import sys +import argparse + from pcluster.config.hit_converter import HitConverter from pcluster.config.pcluster_config import PclusterConfig, default_config_file_path @@ -31,9 +32,7 @@ def _parse_args(argv=None): default_config_file = default_config_file_path() parser = argparse.ArgumentParser( - description=( - "Updates the AWS ParallelCluster configuration file." - ), + description=("Updates the AWS ParallelCluster configuration file."), epilog='For command specific flags, please run: "pcluster-config [command] --help"', ) subparsers = parser.add_subparsers() diff --git a/cli/tests/conftest.py b/cli/tests/conftest.py index 76c854e1e1..e1c4fbc8f3 100644 --- a/cli/tests/conftest.py +++ b/cli/tests/conftest.py @@ -55,7 +55,7 @@ def test_datadir(request, datadir): @pytest.fixture() def convert_to_date_mock(request, mocker): """Mock convert_to_date function by enforcing the timezone to UTC.""" - module_under_test = request.module.__name__.replace("test_", "") + module_under_test = request.node.fspath.purebasename.replace("test_", "") def _convert_to_date_utc(*args, **kwargs): from dateutil import tz @@ -147,7 +147,7 @@ def _boto3_stubber(service, mocked_requests): @pytest.fixture() def awsbatchcliconfig_mock(request, mocker): """Mock AWSBatchCliConfig object with a default mock.""" - module_under_test = request.module.__name__.replace("test_", "") + module_under_test = request.node.fspath.purebasename.replace("test_", "") mock = mocker.patch("awsbatch." + module_under_test + ".AWSBatchCliConfig", autospec=True) for key, value in DEFAULT_AWSBATCHCLICONFIG_MOCK_CONFIG.items(): setattr(mock.return_value, key, value) diff --git a/cli/tests/pcluster/config/utils.py b/cli/tests/pcluster/config/utils.py index 61835ba1f6..fe654cc7aa 100644 --- a/cli/tests/pcluster/config/utils.py +++ b/cli/tests/pcluster/config/utils.py @@ -46,7 +46,7 @@ def merge_dicts(*args): def get_pcluster_config_example(): current_dir = os.path.dirname(os.path.abspath(__file__)) - return os.path.join(current_dir, "..", "..", "..", "pcluster", "examples", "config") + return os.path.join(current_dir, "..", "..", "..", "src", "pcluster", "examples", "config") def set_default_values_for_required_cluster_section_params(cluster_section_dict, only_if_not_present=False): diff --git a/cli/tests/pcluster/configure/__init__.py b/cli/tests/pcluster/configure/__init__.py new file mode 100644 index 0000000000..492c81bc88 --- /dev/null +++ b/cli/tests/pcluster/configure/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
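# With the src/ layout the test directories gain __init__.py files (see the files
# added above and below), so pytest can import test modules under a package-qualified
# name (e.g. "tests.pcluster.configure.test_easyconfig") rather than a bare
# "test_easyconfig". That is the likely reason conftest.py now derives the module
# under test from the file name instead of request.module.__name__. A minimal sketch,
# assuming a hypothetical test file named test_easyconfig.py:
def _module_under_test(request):
    # request.node.fspath is a py.path.local; purebasename drops the directory and extension:
    # "cli/tests/pcluster/configure/test_easyconfig.py" -> "test_easyconfig" -> "easyconfig"
    return request.node.fspath.purebasename.replace("test_", "")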
diff --git a/cli/tests/pcluster/createami/__init__.py b/cli/tests/pcluster/createami/__init__.py new file mode 100644 index 0000000000..492c81bc88 --- /dev/null +++ b/cli/tests/pcluster/createami/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. diff --git a/cli/tox.ini b/cli/tox.ini index e6ce3fcb09..061f9f140d 100644 --- a/cli/tox.ini +++ b/cli/tox.ini @@ -1,29 +1,35 @@ [tox] toxworkdir=../.tox envlist = - py{27,34,35,36,37,38} + py{27,34,35,36,37,38,39}-{cov,nocov} code-linters cfn-{tests,lint,format-check} # Default testenv. Used to run tests on all python versions. [testenv] -passenv = CI TRAVIS_BUILD_ID TRAVIS TRAVIS_BRANCH TRAVIS_JOB_NUMBER TRAVIS_PULL_REQUEST TRAVIS_JOB_ID TRAVIS_REPO_SLUG TRAVIS_COMMIT +passenv = + CI TRAVIS_BUILD_ID TRAVIS TRAVIS_BRANCH TRAVIS_JOB_NUMBER TRAVIS_PULL_REQUEST TRAVIS_JOB_ID TRAVIS_REPO_SLUG TRAVIS_COMMIT +usedevelop = + cov: true + nocov: false whitelist_externals = bash deps = -rtests/requirements.txt - py38: codecov + pytest-travis-fold + cov: codecov commands = bash ./tests/pcluster/test.sh - py{27,34,35,36,37,38}: py.test -l -v --basetemp={envtmpdir} --html=report.html --cov=awsbatch --cov=pcluster tests/ - py38: codecov -e TOXENV + nocov: pytest -l -v --basetemp={envtmpdir} --html=report.html --ignore=src tests/ + cov: python setup.py clean --all build_ext --force --inplace + cov: pytest -l -v --basetemp={envtmpdir} --html=report.html --cov=src tests/ + cov: codecov -e TOXENV # Section used to define common variables used by multiple testenvs. 
[vars] code_dirs = setup.py \ - awsbatch/ \ - pcluster/ \ + src/ \ tests/ \ ../cloudformation/ \ ../tests/ \ @@ -107,8 +113,7 @@ deps = commands = flake8 \ setup.py \ - awsbatch/ \ - pcluster/ \ + src/ \ tests/ \ ../cloudformation/ \ ../tests/integration-tests/ \ @@ -124,8 +129,7 @@ deps = commands = bandit -r \ setup.py \ - awsbatch/ \ - pcluster/ \ + src/ \ ../util/ \ {posargs} @@ -148,9 +152,7 @@ deps = commands = pylint \ setup.py \ - awsbatch/ \ - pcluster/ \ - pcluster/resources/batch/custom_resources_code \ + src/ \ ../util/ \ {posargs} @@ -162,9 +164,7 @@ commands = --disable=all \ --enable=no-value-for-parameter \ setup.py \ - awsbatch/ \ - pcluster/ \ - pcluster/resources/custom_resources/custom_resources_code/ \ + src \ ../util/ \ {posargs} @@ -177,8 +177,7 @@ deps = commands = vulture \ setup.py \ - awsbatch/ \ - pcluster/ \ + src/ \ ../util/ \ {posargs} From 052e11bf29031bbf4eea14dcff21b33494b180ac Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 9 Nov 2020 15:06:39 +0100 Subject: [PATCH 04/66] Configuration file for tests on released Common tests with develop are put in common.yaml, included through jinja Signed-off-by: Luca Carrogu --- .../configs/common/common.yaml | 507 +++++++++++++++++ tests/integration-tests/configs/develop.yaml | 509 +----------------- tests/integration-tests/configs/released.yaml | 6 + 3 files changed, 516 insertions(+), 506 deletions(-) create mode 100644 tests/integration-tests/configs/common/common.yaml create mode 100644 tests/integration-tests/configs/released.yaml diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml new file mode 100644 index 0000000000..64d8f41553 --- /dev/null +++ b/tests/integration-tests/configs/common/common.yaml @@ -0,0 +1,507 @@ +cfn-init: + test_cfn_init.py::test_replace_compute_on_failure: + dimensions: + - regions: ["eu-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_ONE_PER_DISTRO }} + schedulers: ["slurm", "sge"] + test_cfn_init.py::test_install_args_quotes: + dimensions: + - regions: ["us-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["slurm"] +cli_commands: + test_cli_commands.py::test_hit_cli_commands: + dimensions: + - regions: ["ap-northeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804"] + schedulers: ["slurm"] + test_cli_commands.py::test_sit_cli_commands: + dimensions: + - regions: ["us-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["sge"] +cloudwatch_logging: + test_cloudwatch_logging.py::test_cloudwatch_logging: + dimensions: + # 1) run the test for all of the schedulers with alinux2 + - regions: ["cn-northwest-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: {{ common.SCHEDULERS_TRAD }} + - regions: ["us-gov-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + # 2) run the test for all of the OSes with slurm + - regions: ["ap-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + # 3) run the test for a single scheduler-OS combination on an ARM instance + - regions: ["eu-west-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["alinux2"] + schedulers: ["slurm"] +configure: + test_pcluster_configure.py::test_pcluster_configure: + dimensions: + - regions: ["ap-southeast-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ 
common.OSS_ONE_PER_DISTRO }} + schedulers: ["slurm", "sge"] + - regions: ["us-east-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["alinux2"] + schedulers: ["slurm"] + # Do not run on ARM + Batch + # pcluster configure always picks optimal and Batch does not support ARM for optimal for now + - regions: ["us-gov-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + test_pcluster_configure.py::test_pcluster_configure_avoid_bad_subnets: + dimensions: + - regions: ["us-east-1"] # region must be us-east-1 due to hardcoded logic for AZ selection + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] +create: + test_create.py::test_create_wrong_os: + dimensions: + - regions: ["eu-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804"] # os must be different from centos7 to test os validation logic when wrong os is provided + schedulers: ["slurm"] + test_create.py::test_create_wrong_pcluster_version: + dimensions: + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux"] + schedulers: ["slurm"] +createami: + test_createami.py::test_createami: + dimensions: + - regions: ["eu-west-3"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux", "alinux2", "ubuntu1604", "ubuntu1804"] # temporary disable FPGA AMI since there is not enough free space on root partition + - regions: ["us-gov-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1604", "ubuntu1804"] + - regions: ["cn-northwest-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + test_createami.py::test_createami_post_install: + dimensions: + - regions: ["ap-southeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7", "ubuntu1804"] + - regions: ["eu-west-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["alinux2"] + test_createami.py::test_createami_wrong_os: + dimensions: + - regions: ["eu-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux"] # os must be different from alinux2 to test os validation logic when wrong os is provided + test_createami.py::test_createami_wrong_pcluster_version: + dimensions: + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux"] +dashboard: + test_dashboard.py::test_dashboard: + dimensions: + - regions: ["ap-northeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["slurm"] +dcv: + test_dcv.py::test_dcv_configuration: + dimensions: + # DCV on GPU enabled instance + - regions: ["eu-west-1"] + instances: ["g3.8xlarge"] + oss: ["alinux2", "centos7", "ubuntu1804"] + schedulers: ["slurm"] + # DCV on ARM + - regions: ["eu-west-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["alinux2", "ubuntu1804"] + schedulers: ["slurm"] + # DCV in cn regions and non GPU enabled instance + - regions: ["cn-northwest-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + # DCV in gov-cloud regions and non GPU enabled instance + - regions: ["us-gov-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804"] + schedulers: ["slurm"] + test_dcv.py::test_dcv_with_remote_access: + dimensions: + - regions: ["ap-southeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7", "centos8"] + schedulers: ["sge"] +disable_hyperthreading: + test_disable_hyperthreading.py::test_hit_disable_hyperthreading: + dimensions: + # 
Manually disabled HT + - regions: ["us-west-1"] + instances: ["m4.xlarge"] + oss: ["alinux2", "centos7", "ubuntu1604"] + schedulers: ["slurm"] + # HT disabled via CpuOptions + - regions: ["us-west-1"] + instances: ["c5.xlarge"] + oss: ["ubuntu1804"] + schedulers: ["slurm"] + test_disable_hyperthreading.py::test_sit_disable_hyperthreading: + dimensions: + # Manually disabled HT + {%- for os, scheduler in [("alinux", "sge"), ("centos7", "torque"), ("ubuntu1804", "sge")] %} + - regions: ["sa-east-1"] + instances: ["m4.xlarge"] + oss: ["{{ os }}"] + schedulers: ["{{ scheduler }}"] + {%- endfor %} + # HT disabled via CpuOptions + {%- for os, scheduler in [("alinux2", "sge"), ("centos7", "torque")] %} + - regions: ["sa-east-1"] + instances: ["c5.xlarge"] + oss: ["{{ os }}"] + schedulers: ["{{ scheduler }}"] + {%- endfor %} +dns: + test_dns.py::test_hit_no_cluster_dns_mpi: + dimensions: + - regions: ["eu-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] +efa: + test_efa.py::test_hit_efa: + dimensions: + - regions: ["us-east-1"] + instances: ["c5n.18xlarge"] + oss: ["alinux2"] + schedulers: ["slurm"] + - regions: ["us-east-1"] + instances: ["p4d.24xlarge"] + oss: ["alinux2", "ubuntu1604", "centos8"] + schedulers: ["slurm"] + test_efa.py::test_sit_efa: + dimensions: + - regions: ["us-east-1"] + instances: ["c5n.18xlarge"] + oss: {{ common.OSS_COMMERCIAL_X86 }} + # Torque is not supported by OpenMPI distributed with EFA + # Slurm test is to verify EFA works correctly when using the SIT model in the config file + schedulers: ["sge", "slurm"] + # P4d instances are currently not supported in SIT clusters + - regions: ["us-east-1"] + instances: ["p4d.24xlarge"] + oss: ["alinux", "ubuntu1804", "centos7"] + schedulers: ["slurm"] +iam_policies: + test_iam_policies.py::test_iam_policies: + dimensions: + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm", "awsbatch"] +intel_hpc: + test_intel_hpc.py::test_intel_hpc: + dimensions: + - regions: ["us-east-1"] + instances: ["c5n.18xlarge"] + oss: ["centos7", "centos8"] + schedulers: ["slurm"] +networking: + test_cluster_networking.py::test_cluster_in_private_subnet: + dimensions: + - regions: ["us-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + - regions: ["eu-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["sge"] + test_networking.py::test_public_network_topology: + dimensions: + - regions: ["eu-central-1", "us-gov-east-1", "cn-northwest-1"] + test_networking.py::test_public_private_network_topology: + dimensions: + - regions: ["eu-central-1", "us-gov-east-1", "cn-northwest-1"] + test_multi_cidr.py::test_multi_cidr: + dimensions: + - regions: ["ap-northeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm", "awsbatch"] +scaling: + test_scaling.py::test_hit_scaling: + dimensions: + - regions: ["us-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_ONE_PER_DISTRO }} + schedulers: ["slurm"] + test_scaling.py::test_nodewatcher_terminates_failing_node: + dimensions: + - regions: ["sa-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_ONE_PER_DISTRO }} + schedulers: ["sge", "torque"] + test_mpi.py::test_mpi: # TODO: move outside of the scaling dir + dimensions: + - regions: ["ap-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ 
common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm", "sge"] + - regions: ["us-east-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: ["slurm", "sge"] + test_mpi.py::test_mpi_ssh: + dimensions: + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["sge"] +schedulers: + test_sge.py::test_sge: + dimensions: + - regions: ["eu-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["sge"] + - regions: ["eu-central-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: ["sge"] + test_torque.py::test_torque: + dimensions: + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["torque"] + - regions: ["ap-northeast-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: ["torque"] + test_awsbatch.py::test_awsbatch: + dimensions: + - regions: ["eu-north-1", "us-gov-west-1", "cn-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + - regions: ["ap-southeast-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux"] + schedulers: ["awsbatch"] + - regions: ["ap-northeast-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + test_slurm.py::test_slurm: + dimensions: + - regions: ["us-east-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + test_slurm.py::test_slurm_pmix: # TODO: include in main test_slurm to reduce number of created clusters + dimensions: + - regions: ["ap-southeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + - regions: ["ap-northeast-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: ["slurm"] +spot: + test_spot.py::test_spot_default: + dimensions: + - regions: ["us-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["sge", "slurm"] +storage: + test_fsx_lustre.py::test_fsx_lustre: + dimensions: + - regions: ["us-east-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2", "centos7", "centos8", "ubuntu1604", "ubuntu1804"] + schedulers: ["slurm"] + - regions: ["eu-west-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + # FSx is only supported on ARM instances for Ubuntu 18.04, Amazon Linux 2 and CentOS 8 + oss: ["alinux2", "ubuntu1804", "centos8"] + schedulers: ["slurm"] + - regions: ["cn-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux"] + schedulers: ["slurm"] + test_fsx_lustre.py::test_fsx_lustre_configuration_options: + dimensions: + - regions: ["us-east-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_fsx_lustre.py::test_fsx_lustre_backup: + dimensions: + - regions: ["us-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_ONE_PER_DISTRO }} + schedulers: ["sge"] + - regions: ["us-west-2"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + # FSx is only supported on ARM instances for Ubuntu 18.04, Amazon Linux 2 and CentOS 8 + oss: 
["alinux2", "ubuntu1804", "centos8"] + schedulers: ["sge"] + test_efs.py::test_efs_compute_az: + dimensions: + - regions: ["us-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm", "awsbatch"] + test_efs.py::test_efs_same_az: + dimensions: + - regions: ["ap-northeast-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + - regions: ["cn-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_CHINA_X86 }} + schedulers: ["slurm"] + - regions: ["ap-northeast-1", "cn-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_BATCH }} + schedulers: ["awsbatch"] + test_raid.py::test_raid_fault_tolerance_mode: + dimensions: + - regions: ["cn-northwest-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_raid.py::test_raid_performance_mode: + dimensions: + - regions: ["ap-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["sge"] + - regions: ["us-gov-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_GOVCLOUD_X86 }} + schedulers: ["sge"] + - regions: ["ap-south-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_BATCH }} + schedulers: ["awsbatch"] + test_ebs.py::test_default_ebs: + dimensions: + - regions: ["cn-northwest-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux"] + schedulers: ["slurm"] + test_ebs.py::test_ebs_multiple: + dimensions: + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804"] + schedulers: ["slurm"] + test_ebs.py::test_ebs_single: + dimensions: + {%- for region, oss in [("eu-west-3", common.OSS_COMMERCIAL_X86), ("cn-north-1", common.OSS_CHINA_X86), ("us-gov-west-1", common.OSS_GOVCLOUD_X86)] %} + - regions: ["{{ region }}"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ oss }} + schedulers: ["sge"] + {%- endfor %} + test_ebs.py::test_ebs_single_empty: + dimensions: + - regions: ["us-gov-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1604"] + schedulers: ["torque"] + test_ebs.py::test_ebs_snapshot: + dimensions: + - regions: ["ap-northeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["sge"] + - regions: ["cn-northwest-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804"] + schedulers: ["slurm"] +tags: + test_tag_propagation.py::test_tag_propagation: + dimensions: + - regions: ["ap-southeast-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm", "torque", "awsbatch"] +update: + test_update.py::test_update_awsbatch: + dimensions: + - regions: ["eu-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + test_update.py::test_update_hit: + dimensions: + - regions: ["eu-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + test_update.py::test_update_sit: + dimensions: + - regions: ["eu-west-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["sge"] + test_update.py::test_sit_update_compute_instance_disable_ht: + dimensions: + - regions: ["us-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: 
["centos7"] + schedulers: ["sge"] + test_update.py::test_sit_update_compute_instance_extra_json: + dimensions: + - regions: ["us-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["sge"] +multiple_nics: + test_multiple_nics.py::test_multiple_nics: + dimensions: + - regions: ["us-east-1"] + instances: ["p4d.24xlarge"] + oss: ["alinux2", "ubuntu1604", "centos8"] + schedulers: ["slurm"] + - regions: ["us-east-1"] + instances: ["p4d.24xlarge"] + oss: ["alinux", "ubuntu1804", "centos7"] + schedulers: ["slurm"] +resource_bucket: + test_resource_bucket.py::test_resource_bucket: + dimensions: + - regions: ["ap-southeast-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm", "awsbatch"] \ No newline at end of file diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml index e60a6e643f..0eb3fc82b1 100644 --- a/tests/integration-tests/configs/develop.yaml +++ b/tests/integration-tests/configs/develop.yaml @@ -1,253 +1,9 @@ {%- import 'common.jinja2' as common -%} --- test-suites: - cfn-init: - test_cfn_init.py::test_replace_compute_on_failure: - dimensions: - - regions: ["eu-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO }} - schedulers: ["slurm", "sge"] - test_cfn_init.py::test_install_args_quotes: - dimensions: - - regions: ["us-west-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7"] - schedulers: ["slurm"] - cli_commands: - test_cli_commands.py::test_hit_cli_commands: - dimensions: - - regions: ["ap-northeast-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1804"] - schedulers: ["slurm"] - test_cli_commands.py::test_sit_cli_commands: - dimensions: - - regions: ["us-west-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7"] - schedulers: ["sge"] - cloudwatch_logging: - test_cloudwatch_logging.py::test_cloudwatch_logging: - dimensions: - # 1) run the test for all of the schedulers with alinux2 - - regions: ["cn-northwest-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: {{ common.SCHEDULERS_TRAD }} - - regions: ["us-gov-east-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["awsbatch"] - # 2) run the test for all of the OSes with slurm - - regions: ["ap-east-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] - # 3) run the test for a single scheduler-OS combination on an ARM instance - - regions: ["eu-west-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["alinux2"] - schedulers: ["slurm"] - configure: - test_pcluster_configure.py::test_pcluster_configure: - dimensions: - - regions: ["ap-southeast-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO }} - schedulers: ["slurm", "sge"] - - regions: ["us-east-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["alinux2"] - schedulers: ["slurm"] - # Do not run on ARM + Batch - # pcluster configure always picks optimal and Batch does not support ARM for optimal for now - - regions: ["us-gov-west-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["awsbatch"] - test_pcluster_configure.py::test_pcluster_configure_avoid_bad_subnets: - dimensions: - - regions: ["us-east-1"] # region must be us-east-1 due to hardcoded logic for AZ selection - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - 
schedulers: ["slurm"] - create: - test_create.py::test_create_wrong_os: - dimensions: - - regions: ["eu-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1804"] # os must be different from centos7 to test os validation logic when wrong os is provided - schedulers: ["slurm"] - test_create.py::test_create_wrong_pcluster_version: - dimensions: - - regions: ["ca-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux"] - schedulers: ["slurm"] - createami: - test_createami.py::test_createami: - dimensions: - - regions: ["eu-west-3"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux", "alinux2", "ubuntu1604", "ubuntu1804"] # temporary disable FPGA AMI since there is not enough free space on root partition - - regions: ["us-gov-east-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1604", "ubuntu1804"] - - regions: ["cn-northwest-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - test_createami.py::test_createami_post_install: - dimensions: - - regions: ["ap-southeast-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7", "ubuntu1804"] - - regions: ["eu-west-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["alinux2"] - test_createami.py::test_createami_wrong_os: - dimensions: - - regions: ["eu-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux"] # os must be different from alinux2 to test os validation logic when wrong os is provided - test_createami.py::test_createami_wrong_pcluster_version: - dimensions: - - regions: ["ca-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux"] - dashboard: - test_dashboard.py::test_dashboard: - dimensions: - - regions: ["ap-northeast-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7"] - schedulers: ["slurm"] - dcv: - test_dcv.py::test_dcv_configuration: - dimensions: - # DCV on GPU enabled instance - - regions: ["eu-west-1"] - instances: ["g3.8xlarge"] - oss: ["alinux2", "centos7", "ubuntu1804"] - schedulers: ["slurm"] - # DCV on ARM - - regions: ["eu-west-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["alinux2", "ubuntu1804"] - schedulers: ["slurm"] - # DCV in cn regions and non GPU enabled instance - - regions: ["cn-northwest-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm"] - # DCV in gov-cloud regions and non GPU enabled instance - - regions: ["us-gov-west-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1804"] - schedulers: ["slurm"] - test_dcv.py::test_dcv_with_remote_access: - dimensions: - - regions: ["ap-southeast-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7", "centos8"] - schedulers: ["sge"] - disable_hyperthreading: - test_disable_hyperthreading.py::test_hit_disable_hyperthreading: - dimensions: - # Manually disabled HT - - regions: ["us-west-1"] - instances: ["m4.xlarge"] - oss: ["alinux2", "centos7", "ubuntu1604"] - schedulers: ["slurm"] - # HT disabled via CpuOptions - - regions: ["us-west-1"] - instances: ["c5.xlarge"] - oss: ["ubuntu1804"] - schedulers: ["slurm"] - test_disable_hyperthreading.py::test_sit_disable_hyperthreading: - dimensions: - # Manually disabled HT - {%- for os, scheduler in [("alinux", "sge"), ("centos7", "torque"), ("ubuntu1804", "sge")] %} - - regions: ["sa-east-1"] - instances: ["m4.xlarge"] - oss: ["{{ os }}"] - schedulers: ["{{ scheduler }}"] - {%- endfor %} - # HT disabled via CpuOptions - {%- for os, scheduler in [("alinux2", "sge"), 
("centos7", "torque")] %} - - regions: ["sa-east-1"] - instances: ["c5.xlarge"] - oss: ["{{ os }}"] - schedulers: ["{{ scheduler }}"] - {%- endfor %} - dns: - test_dns.py::test_hit_no_cluster_dns_mpi: - dimensions: - - regions: ["eu-west-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] - efa: - test_efa.py::test_hit_efa: - dimensions: - - regions: ["us-east-1"] - instances: ["c5n.18xlarge"] - oss: ["alinux2"] - schedulers: ["slurm"] - - regions: ["us-east-1"] - instances: ["p4d.24xlarge"] - oss: ["alinux2", "ubuntu1604", "centos8"] - schedulers: ["slurm"] - test_efa.py::test_sit_efa: - dimensions: - - regions: ["us-east-1"] - instances: ["c5n.18xlarge"] - oss: {{ common.OSS_COMMERCIAL_X86 }} - # Torque is not supported by OpenMPI distributed with EFA - # Slurm test is to verify EFA works correctly when using the SIT model in the config file - schedulers: ["sge", "slurm"] - # P4d instances are currently not supported in SIT clusters - - regions: ["us-east-1"] - instances: ["p4d.24xlarge"] - oss: ["alinux", "ubuntu1804", "centos7"] - schedulers: ["slurm"] - iam_policies: - test_iam_policies.py::test_iam_policies: - dimensions: - - regions: ["eu-north-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm", "awsbatch"] - intel_hpc: - test_intel_hpc.py::test_intel_hpc: - dimensions: - - regions: ["us-east-1"] - instances: ["c5n.18xlarge"] - oss: ["centos7", "centos8"] - schedulers: ["slurm"] - networking: - test_cluster_networking.py::test_cluster_in_private_subnet: - dimensions: - - regions: ["us-west-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm"] - - regions: ["eu-west-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7"] - schedulers: ["sge"] - test_networking.py::test_public_network_topology: - dimensions: - - regions: ["eu-central-1", "us-gov-east-1", "cn-northwest-1"] - test_networking.py::test_public_private_network_topology: - dimensions: - - regions: ["eu-central-1", "us-gov-east-1", "cn-northwest-1"] - test_multi_cidr.py::test_multi_cidr: - dimensions: - - regions: ["ap-northeast-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm", "awsbatch"] +{% filter indent(2) %} +{% include 'common/common.yaml' %} +{% endfilter %} runtime_bake: test_runtime_bake.py::test_runtime_bake: # These are currently skipped dimensions: @@ -270,12 +26,6 @@ test-suites: schedulers: ["{{ scheduler }}"] {%- endfor %} scaling: - test_scaling.py::test_hit_scaling: - dimensions: - - regions: ["us-west-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO }} - schedulers: ["slurm"] test_scaling.py::test_multiple_jobs_submission: dimensions: - regions: {{ common.REGIONS_COMMERCIAL }} @@ -298,256 +48,3 @@ test-suites: instances: {{ common.INSTANCES_DEFAULT_ARM }} oss: {{ common.OSS_COMMERCIAL_ARM }} schedulers: {{ common.SCHEDULERS_TRAD }} - test_scaling.py::test_nodewatcher_terminates_failing_node: - dimensions: - - regions: ["sa-east-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO }} - schedulers: ["sge", "torque"] - test_mpi.py::test_mpi: # TODO: move outside of the scaling dir - dimensions: - - regions: ["ap-east-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm", "sge"] - - regions: ["us-east-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ 
common.OSS_COMMERCIAL_ARM }} - schedulers: ["slurm", "sge"] - test_mpi.py::test_mpi_ssh: - dimensions: - - regions: ["eu-north-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] - - regions: ["ca-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["sge"] - schedulers: - test_sge.py::test_sge: - dimensions: - - regions: ["eu-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["sge"] - - regions: ["eu-central-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM }} - schedulers: ["sge"] - test_torque.py::test_torque: - dimensions: - - regions: ["ca-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["torque"] - - regions: ["ap-northeast-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM }} - schedulers: ["torque"] - test_awsbatch.py::test_awsbatch: - dimensions: - - regions: ["eu-north-1", "us-gov-west-1", "cn-north-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["awsbatch"] - - regions: ["ap-southeast-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux"] - schedulers: ["awsbatch"] - - regions: ["ap-northeast-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: ["alinux2"] - schedulers: ["awsbatch"] - test_slurm.py::test_slurm: - dimensions: - - regions: ["us-east-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] - test_slurm.py::test_slurm_pmix: # TODO: include in main test_slurm to reduce number of created clusters - dimensions: - - regions: ["ap-southeast-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] - - regions: ["ap-northeast-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - oss: {{ common.OSS_COMMERCIAL_ARM }} - schedulers: ["slurm"] - spot: - test_spot.py::test_spot_default: - dimensions: - - regions: ["us-west-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7"] - schedulers: ["sge", "slurm"] - storage: - test_fsx_lustre.py::test_fsx_lustre: - dimensions: - - regions: ["us-east-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2", "centos7", "centos8", "ubuntu1604", "ubuntu1804"] - schedulers: ["slurm"] - - regions: ["eu-west-1"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - # FSx is only supported on ARM instances for Ubuntu 18.04, Amazon Linux 2 and CentOS 8 - oss: ["alinux2", "ubuntu1804", "centos8"] - schedulers: ["slurm"] - - regions: [ "cn-north-1" ] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: [ "alinux" ] - schedulers: [ "slurm" ] - test_fsx_lustre.py::test_fsx_lustre_configuration_options: - dimensions: - - regions: ["us-east-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm"] - test_fsx_lustre.py::test_fsx_lustre_backup: - dimensions: - - regions: ["us-west-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_ONE_PER_DISTRO }} - schedulers: ["sge"] - - regions: ["us-west-2"] - instances: {{ common.INSTANCES_DEFAULT_ARM }} - # FSx is only supported on ARM instances for Ubuntu 18.04, Amazon Linux 2 and CentOS 8 - oss: ["alinux2", "ubuntu1804", "centos8"] - schedulers: ["sge"] - test_efs.py::test_efs_compute_az: - dimensions: - - regions: ["us-west-1"] - 
instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm", "awsbatch"] - test_efs.py::test_efs_same_az: - dimensions: - - regions: ["ap-northeast-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] - - regions: ["cn-north-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_CHINA_X86 }} - schedulers: ["slurm"] - - regions: ["ap-northeast-1", "cn-north-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_BATCH }} - schedulers: ["awsbatch"] - test_raid.py::test_raid_fault_tolerance_mode: - dimensions: - - regions: ["cn-northwest-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm"] - test_raid.py::test_raid_performance_mode: - dimensions: - - regions: ["ap-south-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["sge"] - - regions: ["us-gov-east-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_GOVCLOUD_X86 }} - schedulers: ["sge"] - - regions: ["ap-south-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_BATCH }} - schedulers: ["awsbatch"] - test_ebs.py::test_default_ebs: - dimensions: - - regions: ["cn-northwest-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux"] - schedulers: ["slurm"] - test_ebs.py::test_ebs_multiple: - dimensions: - - regions: ["ca-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["awsbatch"] - - regions: ["ca-central-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1804"] - schedulers: ["slurm"] - test_ebs.py::test_ebs_single: - dimensions: - {%- for region, oss in [("eu-west-3", common.OSS_COMMERCIAL_X86), ("cn-north-1", common.OSS_CHINA_X86), ("us-gov-west-1", common.OSS_GOVCLOUD_X86)] %} - - regions: ["{{ region }}"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ oss }} - schedulers: ["sge"] - {%- endfor %} - test_ebs.py::test_ebs_single_empty: - dimensions: - - regions: ["us-gov-east-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1604"] - schedulers: ["torque"] - test_ebs.py::test_ebs_snapshot: - dimensions: - - regions: ["ap-northeast-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["sge"] - - regions: ["cn-northwest-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["ubuntu1804"] - schedulers: ["slurm"] - tags: - test_tag_propagation.py::test_tag_propagation: - dimensions: - - regions: ["ap-southeast-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm", "torque", "awsbatch"] - update: - test_update.py::test_update_awsbatch: - dimensions: - - regions: ["eu-west-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["awsbatch"] - test_update.py::test_update_hit: - dimensions: - - regions: ["eu-west-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: {{ common.OSS_COMMERCIAL_X86 }} - schedulers: ["slurm"] - test_update.py::test_update_sit: - dimensions: - - regions: ["eu-west-2"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7"] - schedulers: ["sge"] - test_update.py::test_sit_update_compute_instance_disable_ht: - dimensions: - - regions: ["us-west-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7"] - schedulers: ["sge"] - test_update.py::test_sit_update_compute_instance_extra_json: - dimensions: - - regions: ["us-west-1"] - 
instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["centos7"] - schedulers: ["sge"] - multiple_nics: - test_multiple_nics.py::test_multiple_nics: - dimensions: - - regions: ["us-east-1"] - instances: ["p4d.24xlarge"] - oss: ["alinux2", "ubuntu1604", "centos8"] - schedulers: ["slurm"] - - regions: ["us-east-1"] - instances: ["p4d.24xlarge"] - oss: ["alinux", "ubuntu1804", "centos7"] - schedulers: ["slurm"] - resource_bucket: - test_resource_bucket.py::test_resource_bucket: - dimensions: - - regions: ["ap-southeast-1"] - instances: {{ common.INSTANCES_DEFAULT_X86 }} - oss: ["alinux2"] - schedulers: ["slurm", "awsbatch"] \ No newline at end of file diff --git a/tests/integration-tests/configs/released.yaml b/tests/integration-tests/configs/released.yaml new file mode 100644 index 0000000000..07d642b856 --- /dev/null +++ b/tests/integration-tests/configs/released.yaml @@ -0,0 +1,6 @@ +{%- import 'common.jinja2' as common -%} +--- +test-suites: +{% filter indent(2) %} +{% include 'common/common.yaml' %} +{% endfilter %} From 28c23cfa4ec097920ca6612bfe1fad64032f99b9 Mon Sep 17 00:00:00 2001 From: Hanwen <68928867+hanwen-pcluste@users.noreply.github.com> Date: Fri, 20 Nov 2020 10:38:36 -0500 Subject: [PATCH 05/66] integ-tests: test security groups are properly applied to a cluster (#2234) 1. Test `additional_sg` in the config file is added to head and compute nodes 2. Test `ssh_from` in the config file applies to the pcluster security group of the head node 3. Test `vpc_security_group_id` in the config file overwrites security group of head and compute nodes, FSx, and EFS Signed-off-by: Hanwen --- .../configs/common/common.yaml | 20 +++ tests/integration-tests/conftest.py | 2 +- tests/integration-tests/tests/dcv/test_dcv.py | 20 +-- .../tests/networking/test_security_groups.py | 152 ++++++++++++++++++ .../pcluster.config.ini | 28 ++++ .../test_overwrite_sg/pcluster.config.ini | 37 +++++ tests/integration-tests/utils.py | 10 ++ 7 files changed, 256 insertions(+), 13 deletions(-) create mode 100644 tests/integration-tests/tests/networking/test_security_groups.py create mode 100644 tests/integration-tests/tests/networking/test_security_groups/test_additional_sg_and_ssh_from/pcluster.config.ini create mode 100644 tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.ini diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index 64d8f41553..140a2e367b 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -245,6 +245,26 @@ networking: instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["slurm", "awsbatch"] + test_security_groups.py::test_additional_sg_and_ssh_from: + dimensions: + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["slurm"] + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + test_security_groups.py::test_overwrite_sg: + dimensions: + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["slurm"] + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] scaling: test_scaling.py::test_hit_scaling: dimensions: diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 5b27aa7b4f..a1070af0e7 100644 --- 
a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -617,7 +617,7 @@ def vpc_stacks(cfn_stacks_factory, request): return vpc_stacks -@pytest.fixture() +@pytest.fixture(scope="class") def vpc_stack(vpc_stacks, region): return vpc_stacks[region] diff --git a/tests/integration-tests/tests/dcv/test_dcv.py b/tests/integration-tests/tests/dcv/test_dcv.py index 511a2fff00..c7ddb9e789 100644 --- a/tests/integration-tests/tests/dcv/test_dcv.py +++ b/tests/integration-tests/tests/dcv/test_dcv.py @@ -12,11 +12,16 @@ import os as operating_system import re -import boto3 import pytest from assertpy import assert_that from remote_command_executor import RemoteCommandExecutor -from utils import add_keys_to_known_hosts, get_username_for_os, remove_keys_from_known_hosts, run_command +from utils import ( + add_keys_to_known_hosts, + check_headnode_security_group, + get_username_for_os, + remove_keys_from_known_hosts, + run_command, +) from tests.cloudwatch_logging.test_cloudwatch_logging import FeatureSpecificCloudWatchLoggingTestRunner @@ -106,7 +111,7 @@ def _test_dcv_configuration( remote_command_executor = RemoteCommandExecutor(cluster) # check configuration parameters - _check_security_group(region, cluster, dcv_port, expected_cidr=access_from) + check_headnode_security_group(region, cluster, dcv_port, expected_cidr=access_from) # dcv connect show url env = operating_system.environ.copy() @@ -198,15 +203,6 @@ def _check_auth_ok(remote_command_executor, external_authenticator_port, session ).is_equal_to('{0}'.format(username)) -def _check_security_group(region, cluster, port, expected_cidr): - security_group_id = cluster.cfn_resources.get("MasterSecurityGroup") - response = boto3.client("ec2", region_name=region).describe_security_groups(GroupIds=[security_group_id]) - - ips = response["SecurityGroups"][0]["IpPermissions"] - target = next(filter(lambda x: x.get("FromPort", -1) == port, ips), {}) - assert_that(target["IpRanges"][0]["CidrIp"]).is_equal_to(expected_cidr) - - def _check_no_crashes(remote_command_executor, test_datadir): """Verify no core files in /var/crash, which on ubuntu18 causes a popup when logging into the 1st session.""" remote_command_executor.run_remote_script(str(test_datadir / "verify_no_core_files.sh")) diff --git a/tests/integration-tests/tests/networking/test_security_groups.py b/tests/integration-tests/tests/networking/test_security_groups.py new file mode 100644 index 0000000000..5c7e4481cb --- /dev/null +++ b/tests/integration-tests/tests/networking/test_security_groups.py @@ -0,0 +1,152 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging
+
+import boto3
+import pytest
+from assertpy import assert_that
+from cfn_stacks_factory import CfnStack
+from troposphere import Ref, Template
+from troposphere.ec2 import SecurityGroup, SecurityGroupIngress
+from utils import check_headnode_security_group, random_alphanumeric
+
+
+@pytest.mark.usefixtures("os", "scheduler", "instance")
+def test_additional_sg_and_ssh_from(region, custom_security_group, pcluster_config_reader, clusters_factory):
+    """
+    Test when additional_sg and ssh_from are provided in the config file.
+
+    The additional security group should be added to the head and compute nodes, and the ssh_from
+    CIDR should be applied to the ParallelCluster security group of the head node.
+    """
+    custom_security_group_id = custom_security_group.cfn_resources["SecurityGroupResource"]
+    ssh_from = "10.11.12.0/32"
+    cluster_config = pcluster_config_reader(additional_sg=custom_security_group_id, ssh_from=ssh_from)
+    cluster = clusters_factory(cluster_config)
+    ec2_client = boto3.client("ec2", region_name=region)
+    instances = _get_instances_by_security_group(ec2_client, custom_security_group_id)
+    logging.info("Asserting that the head node and compute node have the additional security group")
+    assert_that(instances).is_length(2)
+    logging.info("Asserting that the pcluster security group is not overwritten by the additional security group")
+    for instance in instances:
+        assert_that(
+            any(
+                security_group["GroupName"].startswith("parallelcluster")
+                for security_group in instance["SecurityGroups"]
+            )
+        ).is_true()
+    logging.info("Asserting that the pcluster security group on the head node is aligned with ssh_from")
+    check_headnode_security_group(region, cluster, 22, ssh_from)
+
+
+@pytest.mark.usefixtures("os", "scheduler", "instance")
+def test_overwrite_sg(region, custom_security_group, pcluster_config_reader, clusters_factory):
+    """Test that vpc_security_group_id overwrites the pcluster default security group on head and compute nodes, EFS and FSx."""
+    custom_security_group_id = custom_security_group.cfn_resources["SecurityGroupResource"]
+    cluster_config = pcluster_config_reader(vpc_security_group_id=custom_security_group_id)
+    cluster = clusters_factory(cluster_config)
+    ec2_client = boto3.client("ec2", region_name=region)
+    instances = _get_instances_by_security_group(ec2_client, custom_security_group_id)
+    logging.info("Asserting that the head node and compute node have only the custom security group")
+    assert_that(instances).is_length(2)
+    for instance in instances:
+        assert_that(instance["SecurityGroups"]).is_length(1)
+
+    cfn_client = boto3.client("cloudformation", region_name=region)
+
+    logging.info("Collecting security groups of the FSx")
+    fsx_id = cfn_client.describe_stack_resource(
+        StackName=cluster.cfn_resources["FSXSubstack"], LogicalResourceId="FileSystem"
+    )["StackResourceDetail"]["PhysicalResourceId"]
+    fsx_client = boto3.client("fsx", region_name=region)
+    network_interface_id = fsx_client.describe_file_systems(FileSystemIds=[fsx_id])["FileSystems"][0][
+        "NetworkInterfaceIds"
+    ][0]
+    fsx_security_groups = ec2_client.describe_network_interfaces(NetworkInterfaceIds=[network_interface_id])[
+        "NetworkInterfaces"
+    ][0]["Groups"]
+    logging.info("Asserting that the network interface of FSx has only the custom security group")
+    assert_that(fsx_security_groups[0]["GroupId"]).is_equal_to(custom_security_group_id)
+    assert_that(fsx_security_groups).is_length(1)
+
+    logging.info("Collecting security groups of the EFS")
+    efs_id = cfn_client.describe_stack_resource(
+        StackName=cluster.cfn_resources["EFSSubstack"], LogicalResourceId="EFSFS"
)["StackResourceDetail"]["PhysicalResourceId"] + efs_client = boto3.client("efs", region_name=region) + mount_target_ids = [ + mount_target["MountTargetId"] + for mount_target in efs_client.describe_mount_targets(FileSystemId=efs_id)["MountTargets"] + ] + logging.info("Asserting the mount targets of EFS has and only has the custom security group") + for mount_target_id in mount_target_ids: + mount_target_security_groups = efs_client.describe_mount_target_security_groups(MountTargetId=mount_target_id)[ + "SecurityGroups" + ] + assert_that(mount_target_security_groups[0]).is_equal_to(custom_security_group_id) + assert_that(mount_target_security_groups).is_length(1) + + +@pytest.fixture(scope="class") +def custom_security_group(vpc_stack, region, request, cfn_stacks_factory): + template = Template() + template.set_version("2010-09-09") + template.set_description("custom security group stack for testing additional_sg and vpc_security_group_id") + security_group = template.add_resource( + SecurityGroup( + "SecurityGroupResource", + GroupDescription="custom security group for testing additional_sg and vpc_security_group_id", + VpcId=vpc_stack.cfn_outputs["VpcId"], + ) + ) + template.add_resource( + SecurityGroupIngress( + "SecurityGroupIngressResource", + IpProtocol="-1", + FromPort=0, + ToPort=65535, + SourceSecurityGroupId=Ref(security_group), + GroupId=Ref(security_group), + ) + ) + stack = CfnStack( + name="integ-tests-custom-sg-{0}{1}{2}".format( + random_alphanumeric(), + "-" if request.config.getoption("stackname_suffix") else "", + request.config.getoption("stackname_suffix"), + ), + region=region, + template=template.to_json(), + ) + cfn_stacks_factory.create_stack(stack) + + yield stack + + if not request.config.getoption("no_delete"): + cfn_stacks_factory.delete_stack(stack.name, region) + + +def _get_instances_by_security_group(ec2_client, security_group_id): + logging.info("Collecting security groups of the head node and compute node") + paginator = ec2_client.get_paginator("describe_instances") + page_iterator = paginator.paginate( + Filters=[ + { + "Name": "network-interface.group-id", + "Values": [security_group_id], + } + ] + ) + instances = [] + for page in page_iterator: + for reservation in page["Reservations"]: + instances.extend(reservation["Instances"]) + return instances diff --git a/tests/integration-tests/tests/networking/test_security_groups/test_additional_sg_and_ssh_from/pcluster.config.ini b/tests/integration-tests/tests/networking/test_security_groups/test_additional_sg_and_ssh_from/pcluster.config.ini new file mode 100644 index 0000000000..fa0ff66786 --- /dev/null +++ b/tests/integration-tests/tests/networking/test_security_groups/test_additional_sg_and_ssh_from/pcluster.config.ini @@ -0,0 +1,28 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +{% if scheduler == "awsbatch" %} +min_vcpus = 1 +desired_vcpus = 1 +{% else %} +initial_queue_size = 1 +maintain_initial_size = true +{% endif %} + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_additional_cidr_subnet_id }} +additional_sg = {{ additional_sg }} +use_public_ips = false +ssh_from = {{ ssh_from }} \ No newline at end of file diff --git 
a/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.ini b/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.ini new file mode 100644 index 0000000000..254d4f10f4 --- /dev/null +++ b/tests/integration-tests/tests/networking/test_security_groups/test_overwrite_sg/pcluster.config.ini @@ -0,0 +1,37 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +{% if scheduler == "awsbatch" %} +min_vcpus = 1 +desired_vcpus = 1 +{% else %} +initial_queue_size = 1 +maintain_initial_size = true +{% endif %} +efs_settings = parallelcluster-efs +fsx_settings = parallelcluster-fsx + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_additional_cidr_subnet_id }} +vpc_security_group_id = {{ vpc_security_group_id }} +use_public_ips = false + +[efs parallelcluster-efs] +shared_dir = efs + +[fsx parallelcluster-fsx] +shared_dir = fsx +storage_capacity = 1200 +deployment_type = SCRATCH_2 diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py index c47061e4bf..d0712770c8 100644 --- a/tests/integration-tests/utils.py +++ b/tests/integration-tests/utils.py @@ -362,3 +362,13 @@ def get_architecture_supported_by_instance_type(instance_type, region_name=None) assert_that(len(instance_architectures)).is_equal_to(1) return instance_architectures[0] + + +def check_headnode_security_group(region, cluster, port, expected_cidr): + """Check CIDR restriction for a port is in the security group of the head node of the cluster""" + security_group_id = cluster.cfn_resources.get("MasterSecurityGroup") + response = boto3.client("ec2", region_name=region).describe_security_groups(GroupIds=[security_group_id]) + + ips = response["SecurityGroups"][0]["IpPermissions"] + target = next(filter(lambda x: x.get("FromPort", -1) == port, ips), {}) + assert_that(target["IpRanges"][0]["CidrIp"]).is_equal_to(expected_cidr) From 7b5d6fd0e439042c5b0d3a697a0b809f3deeb961 Mon Sep 17 00:00:00 2001 From: Yulei <68350383+yuleiwan@users.noreply.github.com> Date: Fri, 20 Nov 2020 07:56:46 -0800 Subject: [PATCH 06/66] integ-tests: test deployment types (#2233) * This test is to verify FSx file system launched by pcluster has the correct deployment type as user set in pcluster config file. 
* FSx file system has three deployment types in commercial regions SCRATCH_1(dafault), SCRATCH_2, PERSISTENT_1 Signed-off-by: Yulei Wang --- .../tests/storage/test_fsx_lustre.py | 42 ++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre.py b/tests/integration-tests/tests/storage/test_fsx_lustre.py index fa81c5dd1f..5fb5be6973 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre.py +++ b/tests/integration-tests/tests/storage/test_fsx_lustre.py @@ -84,14 +84,16 @@ def test_fsx_lustre_configuration_options( storage_capacity=storage_capacity, ) cluster = clusters_factory(cluster_config) - _test_fsx_lustre(cluster, region, scheduler, os, mount_dir, bucket_name, storage_type, auto_import_policy) + _test_fsx_lustre( + cluster, region, scheduler, os, mount_dir, bucket_name, storage_type, auto_import_policy, deployment_type + ) @pytest.mark.regions(["eu-west-1"]) @pytest.mark.instances(["c5.xlarge", "m6g.xlarge"]) @pytest.mark.schedulers(["slurm"]) @pytest.mark.usefixtures("instance") -# FSx is only supported on ARM instances for Ubuntu 18.04 and Amazon Linux 2 +# FSx is only supported on ARM instances for Ubuntu 18.04, Amazon Linux 2 and CentOS 8 @pytest.mark.skip_dimensions("*", "m6g.xlarge", "alinux", "*") @pytest.mark.skip_dimensions("*", "m6g.xlarge", "centos7", "*") @pytest.mark.skip_dimensions("*", "m6g.xlarge", "ubuntu1604", "*") @@ -119,10 +121,22 @@ def test_fsx_lustre( storage_capacity=1200, ) cluster = clusters_factory(cluster_config) - _test_fsx_lustre(cluster, region, scheduler, os, mount_dir, bucket_name, storage_type=None, auto_import_policy=None) + _test_fsx_lustre( + cluster, + region, + scheduler, + os, + mount_dir, + bucket_name, + storage_type=None, + auto_import_policy=None, + deployment_type=None, + ) -def _test_fsx_lustre(cluster, region, scheduler, os, mount_dir, bucket_name, storage_type, auto_import_policy): +def _test_fsx_lustre( + cluster, region, scheduler, os, mount_dir, bucket_name, storage_type, auto_import_policy, deployment_type +): remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) fsx_fs_id = get_fsx_fs_id(cluster, region) @@ -131,6 +145,7 @@ def _test_fsx_lustre(cluster, region, scheduler, os, mount_dir, bucket_name, sto _test_import_path(remote_command_executor, mount_dir) _test_fsx_lustre_correctly_shared(scheduler_commands, remote_command_executor, mount_dir) _test_storage_type(storage_type, fsx_fs_id, region) + _test_deployment_type(deployment_type, fsx_fs_id, region) _test_export_path(remote_command_executor, mount_dir, bucket_name, region) _test_auto_import(auto_import_policy, remote_command_executor, mount_dir, bucket_name, region) _test_data_repository_task(remote_command_executor, mount_dir, bucket_name, fsx_fs_id, region) @@ -140,7 +155,7 @@ def _test_fsx_lustre(cluster, region, scheduler, os, mount_dir, bucket_name, sto @pytest.mark.instances(["c5.xlarge", "m6g.xlarge"]) @pytest.mark.schedulers(["sge"]) @pytest.mark.usefixtures("instance") -# FSx is only supported on ARM instances for Ubuntu 18.04 and Amazon Linux 2 +# FSx is only supported on ARM instances for Ubuntu 18.04, Amazon Linux 2 and CentOS 8 @pytest.mark.skip_dimensions("*", "m6g.xlarge", "alinux", "*") @pytest.mark.skip_dimensions("*", "m6g.xlarge", "centos7", "*") @pytest.mark.skip_dimensions("*", "m6g.xlarge", "ubuntu1604", "*") @@ -264,6 +279,23 @@ def _test_storage_type(storage_type, fsx_fs_id, region): 
assert_that(get_storage_type(fsx_fs_id, region)).is_equal_to("SSD") +def _get_deployment_type(fsx_fs_id, region): + deployment_type = ( + boto3.client("fsx", region_name=region) + .describe_file_systems(FileSystemIds=[fsx_fs_id]) + .get("FileSystems")[0] + .get("LustreConfiguration") + .get("DeploymentType") + ) + logging.info(f"Getting DeploymentType {deployment_type} from DescribeFilesystem API.") + return deployment_type + + +def _test_deployment_type(deployment_type, fsx_fs_id, region): + if deployment_type: + assert_that(_get_deployment_type(fsx_fs_id, region)).is_equal_to(deployment_type) + + def _test_import_path(remote_command_executor, mount_dir): logging.info("Testing fsx lustre import path") result = remote_command_executor.run_remote_command("cat {mount_dir}/s3_test_file".format(mount_dir=mount_dir)) From f296e3c57e45fb465efdd8ed45834adee17f5fd4 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 23 Nov 2020 11:14:45 +0100 Subject: [PATCH 07/66] Bump version to 2.10.1 Signed-off-by: Francesco De Martino --- CHANGELOG.md | 3 +++ cli/setup.py | 2 +- cloudformation/aws-parallelcluster.cfn.json | 6 +++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb9bd30d3e..459b65ae25 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,9 @@ CHANGELOG ========= +2.10.1 +------ + 2.10.0 ------ diff --git a/cli/setup.py b/cli/setup.py index d5b3a752cf..33fc52695e 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -21,7 +21,7 @@ def readme(): return f.read() -VERSION = "2.10.0" +VERSION = "2.10.1" REQUIRES = [ "setuptools", "boto3>=1.16.14", diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 5dafbc0235..2cab11c7d7 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1,6 +1,6 @@ { "AWSTemplateFormatVersion": "2010-09-09", - "Description": "AWS ParallelCluster Template. Version: aws-parallelcluster-2.10.0", + "Description": "AWS ParallelCluster Template. Version: aws-parallelcluster-2.10.1", "Parameters": { "KeyName": { "Description": "Name of an existing EC2 KeyPair to enable SSH access to the instances using the default cluster user.", @@ -1180,8 +1180,8 @@ }, "PackagesVersions": { "default": { - "parallelcluster": "2.10.0", - "cookbook": "aws-parallelcluster-cookbook-2.10.0", + "parallelcluster": "2.10.1", + "cookbook": "aws-parallelcluster-cookbook-2.10.1", "chef": "15.11.8", "berkshelf": "7.0.10", "ami": "dev" From 19894892e0333e43a6e2d33a07005323e9f81304 Mon Sep 17 00:00:00 2001 From: Tim Lane Date: Wed, 25 Nov 2020 09:36:01 -0800 Subject: [PATCH 08/66] Continue polling FSx backup when state is PENDING Previously we were only continuing to poll when the state was one of CREATING or TRANSFERRING. According to the boto3 docs, we should also handle the PENDING state as well. 
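Editor's note: for readers unfamiliar with the backup lifecycle handling described above, the sketch below illustrates the kind of polling loop this change affects. It is an illustrative, self-contained example rather than the test's actual helper: the function name `wait_for_backup_available` and the retry parameters are invented for this sketch; only the lifecycle states and the FSx DescribeBackups call mirror what the patch relies on.

import time

import boto3

# Backup exists but is not usable yet; per the boto3 docs, PENDING must be
# treated like CREATING and TRANSFERRING, which is what this patch fixes.
BACKUP_NOT_YET_AVAILABLE_STATES = {"CREATING", "TRANSFERRING", "PENDING"}


def wait_for_backup_available(backup_id, region, poll_seconds=60, max_polls=30):
    """Poll the FSx DescribeBackups API until the backup leaves the not-yet-available states."""
    fsx = boto3.client("fsx", region_name=region)
    for _ in range(max_polls):
        lifecycle = fsx.describe_backups(BackupIds=[backup_id])["Backups"][0]["Lifecycle"]
        if lifecycle not in BACKUP_NOT_YET_AVAILABLE_STATES:
            return lifecycle  # typically AVAILABLE, or FAILED on error
        time.sleep(poll_seconds)
    raise TimeoutError("Backup {0} did not leave states {1}".format(backup_id, BACKUP_NOT_YET_AVAILABLE_STATES))

A caller would typically invoke such a helper right after requesting the backup and before asserting on its contents, which is why missing the PENDING state caused premature failures.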
Signed-off-by: Tim Lane --- tests/integration-tests/tests/storage/test_fsx_lustre.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre.py b/tests/integration-tests/tests/storage/test_fsx_lustre.py index 5fb5be6973..79bb16254f 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre.py +++ b/tests/integration-tests/tests/storage/test_fsx_lustre.py @@ -24,7 +24,7 @@ from tests.common.schedulers_common import get_scheduler_commands -BACKUP_NOT_YET_AVAILABLE_STATES = {"CREATING", "TRANSFERRING"} +BACKUP_NOT_YET_AVAILABLE_STATES = {"CREATING", "TRANSFERRING", "PENDING"} # Maximum number of minutes to wait past when an file system's automatic backup is scheduled to start creating. # If after this many minutes past the scheduled time backup creation has not started, the test will fail. MAX_MINUTES_TO_WAIT_FOR_AUTOMATIC_BACKUP_START = 5 From 6d05ac62e1eddbd3583663038c767e23f6fe848b Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Wed, 25 Nov 2020 11:18:44 +0100 Subject: [PATCH 09/66] Fix torque test when checking for scheduler configuration Add 1 second sleep to give time to sqswatcher to reconfigure the master with np = max_nodes * node_slots This operation is performed right after sqswatcher removes the compute nodes from the scheduler Signed-off-by: Luca Carrogu --- tests/integration-tests/tests/schedulers/test_torque.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/schedulers/test_torque.py b/tests/integration-tests/tests/schedulers/test_torque.py index afa568e05b..12434bd5ba 100644 --- a/tests/integration-tests/tests/schedulers/test_torque.py +++ b/tests/integration-tests/tests/schedulers/test_torque.py @@ -192,7 +192,9 @@ def _test_dynamic_cluster_limits(remote_command_executor, max_queue_size, max_sl # Make sure cluster is scaled to 0 when this test starts assert_that(torque_commands.compute_nodes_count()).is_equal_to(0) - + # sleeping for 1 second to give time to sqswatcher to reconfigure the master with np = max_nodes * node_slots + # operation that is performed right after sqswatcher removes the compute nodes from the scheduler + time.sleep(1) _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size) # Submit a job to scale up to 1 node From 2a7e74d66e573790a3951fcc0820413a2b9be8cd Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 20 Nov 2020 15:24:05 +0100 Subject: [PATCH 10/66] Optimize calls to describe_instance_types API Signed-off-by: Francesco De Martino --- cli/src/pcluster/config/cfn_param_types.py | 21 +- cli/src/pcluster/config/json_param_types.py | 27 ++- cli/src/pcluster/config/validators.py | 7 +- cli/src/pcluster/createami.py | 4 +- .../pcluster/models/hit/hit_cluster_model.py | 10 +- .../pcluster/models/sit/sit_cluster_model.py | 12 +- cli/src/pcluster/utils.py | 194 +++++++++++------- .../pcluster/config/test_config_patch.py | 12 +- .../pcluster/config/test_hit_converter.py | 32 +-- .../pcluster/config/test_json_param_types.py | 34 +-- cli/tests/pcluster/config/test_runtime.py | 6 +- .../pcluster/config/test_section_cluster.py | 10 +- .../config/test_source_consistency.py | 18 -- cli/tests/pcluster/config/test_utils.py | 36 ---- cli/tests/pcluster/config/test_validators.py | 10 +- cli/tests/pcluster/config/utils.py | 41 ++-- .../configure/test_pcluster_configure.py | 4 +- .../createami/test_pcluster_createami.py | 6 +- .../test_pcluster_utils.py => test_utils.py} | 186 
+++++++++++++---- .../test_pcluster_config_convert.py | 2 +- 20 files changed, 394 insertions(+), 278 deletions(-) delete mode 100644 cli/tests/pcluster/config/test_utils.py rename cli/tests/pcluster/{utils/test_pcluster_utils.py => test_utils.py} (87%) diff --git a/cli/src/pcluster/config/cfn_param_types.py b/cli/src/pcluster/config/cfn_param_types.py index bfaa9b588f..7127323012 100644 --- a/cli/src/pcluster/config/cfn_param_types.py +++ b/cli/src/pcluster/config/cfn_param_types.py @@ -19,16 +19,14 @@ from pcluster.config.resource_map import ResourceMap from pcluster.constants import PCLUSTER_ISSUES_LINK from pcluster.utils import ( + InstanceTypeInfo, disable_ht_via_cpu_options, error, get_availability_zone_of_subnet, get_cfn_param, - get_default_threads_per_core, get_ebs_snapshot_info, get_efs_mount_target_id, get_file_section_name, - get_instance_network_interfaces, - get_instance_vcpus, get_supported_architectures_for_instance_type, ) @@ -655,12 +653,13 @@ def _get_cfn_params_for_instance_type(instance_type): HT is disabled (or "NONE" if it shouldn't be disabled). The second item is a boolean expressing if HT should be disabled via CPU Options for the given instance type. """ - default_threads_per_core = get_default_threads_per_core(instance_type) + instance_type_info = InstanceTypeInfo.init_from_instance_type(instance_type) + default_threads_per_core = instance_type_info.default_threads_per_core() if default_threads_per_core == 1: # no action is required to disable hyperthreading cores = "NONE" else: - cores = get_instance_vcpus(instance_type) // default_threads_per_core + cores = instance_type_info.vcpus_count() // default_threads_per_core return cores, disable_ht_via_cpu_options(instance_type, default_threads_per_core) @@ -1086,8 +1085,16 @@ def refresh(self): cluster_section = self.pcluster_config.get_section("cluster") scheduler = cluster_section.get_param_value("scheduler") self.value = [ - str(get_instance_network_interfaces(cluster_section.get_param_value("master_instance_type"))), - str(get_instance_network_interfaces(cluster_section.get_param_value("compute_instance_type"))) + str( + InstanceTypeInfo.init_from_instance_type( + cluster_section.get_param_value("master_instance_type") + ).max_network_interface_count() + ), + str( + InstanceTypeInfo.init_from_instance_type( + cluster_section.get_param_value("compute_instance_type") + ).max_network_interface_count() + ) if self.pcluster_config.cluster_model.name == "SIT" and scheduler != "awsbatch" else "1", ] diff --git a/cli/src/pcluster/config/json_param_types.py b/cli/src/pcluster/config/json_param_types.py index c490f9a855..097d941183 100644 --- a/cli/src/pcluster/config/json_param_types.py +++ b/cli/src/pcluster/config/json_param_types.py @@ -288,35 +288,34 @@ def refresh_compute_resource(self, compute_resource_section): instance_type_param = compute_resource_section.get_param("instance_type") if instance_type_param.value: - instance_type = utils.get_instance_type(instance_type_param.value) + instance_type_info = utils.InstanceTypeInfo.init_from_instance_type(instance_type_param.value) # Set vcpus according to queue's disable_hyperthreading and instance features ht_disabled = self.get_param_value("disable_hyperthreading") - vcpus_info = instance_type.get("VCpuInfo") - default_threads_per_core = utils.get_default_threads_per_core(instance_type_param.value, instance_type) + default_threads_per_core = instance_type_info.default_threads_per_core() vcpus = ( - (vcpus_info.get("DefaultVCpus") // default_threads_per_core) + 
(instance_type_info.vcpus_count() // default_threads_per_core) if ht_disabled - else vcpus_info.get("DefaultVCpus") + else instance_type_info.vcpus_count() ) compute_resource_section.get_param("vcpus").value = vcpus # Set gpus according to instance features - gpus = utils.get_instance_gpus(instance_type_param.value, instance_type) + gpus = instance_type_info.gpu_count() compute_resource_section.get_param("gpus").value = gpus # Set enable_efa according to queues' enable_efa and instance features # Instance type must support EFA enable_efa = self.get_param_value("enable_efa") - compute_resource_section.get_param("enable_efa").value = enable_efa and instance_type.get( - "NetworkInfo" - ).get("EfaSupported") + compute_resource_section.get_param("enable_efa").value = ( + enable_efa and instance_type_info.is_efa_supported() + ) # Set enable_efa_gdr according to queues' enable_efa_gdr and instance features # Instance type must support EFA and have GPUs enable_efa_gdr = self.get_param_value("enable_efa_gdr") compute_resource_section.get_param("enable_efa_gdr").value = ( - enable_efa_gdr and instance_type.get("NetworkInfo").get("EfaSupported") and (gpus > 0) + enable_efa_gdr and instance_type_info.is_efa_supported() and (gpus > 0) ) # Set disable_hyperthreading according to queues' disable_hyperthreading and instance features @@ -331,7 +330,7 @@ def refresh_compute_resource(self, compute_resource_section): ).value = compute_resource_section.get_param( "disable_hyperthreading" ).value and utils.disable_ht_via_cpu_options( - instance_type_param.value, utils.get_default_threads_per_core(instance_type_param.value, instance_type) + instance_type_param.value, instance_type_info.default_threads_per_core() ) # Set initial_count to min_count if not manually set @@ -340,9 +339,9 @@ def refresh_compute_resource(self, compute_resource_section): initial_count_param.value = compute_resource_section.get_param_value("min_count") # Set number of network interfaces - compute_resource_section.get_param("network_interfaces").value = utils.get_instance_network_interfaces( - instance_type_param.value, instance_type - ) + compute_resource_section.get_param( + "network_interfaces" + ).value = instance_type_info.max_network_interface_count() # ---------------------- Common functions ---------------------- # diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index d908f891a6..6c37fd0e1a 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -20,13 +20,12 @@ from pcluster.constants import CIDR_ALL_IPS, FSX_HDD_THROUGHPUT, FSX_SSD_THROUGHPUT from pcluster.dcv.utils import get_supported_dcv_os from pcluster.utils import ( + InstanceTypeInfo, ellipsize, get_base_additional_iam_policies, get_ebs_snapshot_info, get_efs_mount_target_id, get_file_section_name, - get_instance_network_interfaces, - get_instance_vcpus, get_partition, get_region, get_supported_architectures_for_instance_type, @@ -1019,7 +1018,7 @@ def compute_instance_type_validator(param_key, param_value, pcluster_config): if "," not in param_value and "." in param_value: # if the type is not a list, and contains dot (nor optimal, nor a family) # validate instance type against max_vcpus limit - vcpus = get_instance_vcpus(param_value) + vcpus = InstanceTypeInfo.init_from_instance_type(param_value).vcpus_count() if vcpus <= 0: warnings.append( "Unable to get the number of vcpus for the compute_instance_type '{0}'. 
" @@ -1036,7 +1035,7 @@ def compute_instance_type_validator(param_key, param_value, pcluster_config): if scheduler != "slurm": # Multiple NICs instance types are currently supported only with Slurm clusters - instance_nics = get_instance_network_interfaces(param_value) + instance_nics = InstanceTypeInfo.init_from_instance_type(param_value).max_network_interface_count() if instance_nics > 1: warnings.append( "Some services needed to support clusters with instance type '{0}' with multiple " diff --git a/cli/src/pcluster/createami.py b/cli/src/pcluster/createami.py index 409bd50a02..3d034fd7c6 100644 --- a/cli/src/pcluster/createami.py +++ b/cli/src/pcluster/createami.py @@ -244,9 +244,9 @@ def _get_default_createami_instance_type(ami_architecture): LOGGER.error("Base AMI used in createami has an unsupported architecture: {0}".format(ami_architecture)) sys.exit(1) - # Ensure instance type is avaiable in the selected region + # Ensure instance type is available in the selected region try: - utils.get_instance_types_info([instance_type], fail_on_error=True) + utils.InstanceTypeInfo.init_from_instance_type(instance_type) except SystemExit as system_exit: if "instance types do not exist" in str(system_exit): LOGGER.error( diff --git a/cli/src/pcluster/models/hit/hit_cluster_model.py b/cli/src/pcluster/models/hit/hit_cluster_model.py index 6ba716a8b5..5970d68346 100644 --- a/cli/src/pcluster/models/hit/hit_cluster_model.py +++ b/cli/src/pcluster/models/hit/hit_cluster_model.py @@ -12,7 +12,7 @@ from pcluster.cluster_model import ClusterModel from pcluster.config import mappings -from pcluster.utils import disable_ht_via_cpu_options, get_default_threads_per_core, get_instance_type +from pcluster.utils import InstanceTypeInfo, disable_ht_via_cpu_options class HITClusterModel(ClusterModel): @@ -59,14 +59,12 @@ def test_configuration(self, pcluster_config): # Initialize CpuOptions disable_hyperthreading = cluster_section.get_param_value("disable_hyperthreading") - master_instance_type_info = get_instance_type(master_instance_type) + master_instance_type_info = InstanceTypeInfo.init_from_instance_type(master_instance_type) # Set vcpus according to queue's disable_hyperthreading and instance features - vcpus_info = master_instance_type_info.get("VCpuInfo") - master_vcpus = vcpus_info.get("DefaultVCpus") + master_vcpus = master_instance_type_info.vcpus_count() - master_cpu_options = {"CoreCount": master_vcpus // 2, "ThreadsPerCore": 1} if disable_hyperthreading else {} - master_threads_per_core = get_default_threads_per_core(master_instance_type) + master_threads_per_core = master_instance_type_info.default_threads_per_core() master_cpu_options = ( {"CoreCount": master_vcpus // master_threads_per_core, "ThreadsPerCore": 1} if disable_hyperthreading and disable_ht_via_cpu_options(master_instance_type, master_threads_per_core) diff --git a/cli/src/pcluster/models/sit/sit_cluster_model.py b/cli/src/pcluster/models/sit/sit_cluster_model.py index 9f4a75c433..5dac1d231f 100644 --- a/cli/src/pcluster/models/sit/sit_cluster_model.py +++ b/cli/src/pcluster/models/sit/sit_cluster_model.py @@ -12,7 +12,7 @@ from pcluster.cluster_model import ClusterModel from pcluster.config import mappings -from pcluster.utils import disable_ht_via_cpu_options, get_default_threads_per_core, get_instance_vcpus +from pcluster.utils import InstanceTypeInfo, disable_ht_via_cpu_options class SITClusterModel(ClusterModel): @@ -82,10 +82,12 @@ def test_configuration(self, pcluster_config): # Initialize CpuOptions disable_hyperthreading = 
cluster_section.get_param_value("disable_hyperthreading") - master_vcpus = get_instance_vcpus(master_instance_type) - master_threads_per_core = get_default_threads_per_core(master_instance_type) - compute_vcpus = get_instance_vcpus(compute_instance_type) - compute_threads_per_core = get_default_threads_per_core(compute_instance_type) + master_instance_type_info = InstanceTypeInfo.init_from_instance_type(master_instance_type) + master_vcpus = master_instance_type_info.vcpus_count() + master_threads_per_core = master_instance_type_info.default_threads_per_core() + compute_instance_type_info = InstanceTypeInfo.init_from_instance_type(compute_instance_type) + compute_vcpus = compute_instance_type_info.vcpus_count() + compute_threads_per_core = compute_instance_type_info.default_threads_per_core() master_cpu_options = ( {"CoreCount": master_vcpus // master_threads_per_core, "ThreadsPerCore": 1} if disable_hyperthreading and disable_ht_via_cpu_options(master_instance_type, master_threads_per_core) diff --git a/cli/src/pcluster/utils.py b/cli/src/pcluster/utils.py index 9d317aec46..c479ad500d 100644 --- a/cli/src/pcluster/utils.py +++ b/cli/src/pcluster/utils.py @@ -10,6 +10,9 @@ # limitations under the License. # fmt: off from __future__ import absolute_import, print_function # isort:skip + +import functools + from future import standard_library # isort:skip standard_library.install_aliases() # fmt: on @@ -311,25 +314,6 @@ def upload_resources_artifacts(bucket_name, artifact_directory, root): bucket.upload_file(os.path.join(root, res), "%s/%s" % (artifact_directory, res)) -def get_instance_vcpus(instance_type, instance_info=None): - """ - Get number of vcpus for the given instance type. - - :param instance_type: the instance type to search for. - :return: the number of vcpus or -1 if the instance type cannot be found - """ - try: - if not instance_info: - instance_info = get_instance_type(instance_type) - - vcpus_info = instance_info.get("VCpuInfo") - vcpus = vcpus_info.get("DefaultVCpus") - except (ClientError): - vcpus = -1 - - return vcpus - - def get_supported_instance_types(): """Return the list of instance types available in the given region.""" ec2_client = boto3.client("ec2") @@ -939,20 +923,6 @@ def validate_pcluster_version_based_on_ami_name(ami_name): ) -def get_instance_types_info(instance_types, fail_on_error=True): - """Return InstanceTypes list returned by EC2's DescribeInstanceTypes API.""" - try: - ec2_client = boto3.client("ec2") - return ec2_client.describe_instance_types(InstanceTypes=instance_types).get("InstanceTypes") - except ClientError as e: - error( - "Error when calling DescribeInstanceTypes for instances {0}: {1}".format( - ", ".join(instance_types), e.response.get("Error").get("Message") - ), - fail_on_error, - ) - - def get_supported_architectures_for_instance_type(instance_type): """Get a list of architectures supported for the given instance type.""" # "optimal" compute instance type (when using batch) implies the use of instances from the @@ -961,8 +931,8 @@ def get_supported_architectures_for_instance_type(instance_type): if instance_type == "optimal": return ["x86_64"] - instance_info = get_instance_types_info([instance_type])[0] - supported_architectures = instance_info.get("ProcessorInfo").get("SupportedArchitectures") + instance_info = InstanceTypeInfo.init_from_instance_type(instance_type) + supported_architectures = instance_info.supported_architecture() # Some instance types support multiple architectures (x86_64 and i386). Filter unsupported ones. 
supported_architectures = list(set(supported_architectures) & set(SUPPORTED_ARCHITECTURES)) @@ -1094,35 +1064,10 @@ def cluster_has_running_capacity(stack_name): return cluster_has_running_capacity.cached_result -def get_instance_type(instance_type): - ec2_client = boto3.client("ec2") - try: - return ec2_client.describe_instance_types(InstanceTypes=[instance_type]).get("InstanceTypes")[0] - except Exception as e: - LOGGER.error("Failed when retrieving instance type data for instance type %s: %s", instance_type, e) - raise e - - -def get_default_threads_per_core(instance_type, instance_info=None): - """Return the default threads per core for the given instance type.""" - # NOTE: currently, .metal instances do not contain the DefaultThreadsPerCore - # attribute in their VCpuInfo section. This is a known issue with the - # ec2 DescribeInstanceTypes API. For these instance types an assumption - # is made that if the instance's supported architectures list includes - # x86_64 then the default is 2, otherwise it's 1. - if instance_info is None: - instance_info = get_instance_type(instance_type) - threads_per_core = instance_info.get("VCpuInfo", {}).get("DefaultThreadsPerCore") - if threads_per_core is None: - supported_architectures = instance_info.get("ProcessorInfo", {}).get("SupportedArchitectures", []) - threads_per_core = 2 if "x86_64" in supported_architectures else 1 - return threads_per_core - - def disable_ht_via_cpu_options(instance_type, default_threads_per_core=None): """Return a boolean describing whether hyperthreading should be disabled via CPU options for instance_type.""" if default_threads_per_core is None: - default_threads_per_core = get_default_threads_per_core(instance_type) + default_threads_per_core = InstanceTypeInfo.init_from_instance_type(instance_type).default_threads_per_core() res = all( [ # If default threads per core is 1, HT doesn't need to be disabled @@ -1225,25 +1170,122 @@ def get_ebs_snapshot_info(ebs_snapshot_id, raise_exceptions=False): ) -def get_instance_network_interfaces(instance_type, instance_info=None): - """Return the number of network interfaces to configure for the instance type.""" - if not instance_info: - instance_info = get_instance_type(instance_type) +class Cache: + """Simple utility class providing a cache mechanism for expensive functions.""" + + _caches = [] + + @staticmethod + def is_enabled(): + """Tell if the cache is enabled.""" + return not os.environ.get("PCLUSTER_CACHE_DISABLED") + + @staticmethod + def clear_all(): + """Clear the content of all caches.""" + for cache in Cache._caches: + cache.clear() + + @staticmethod + def _make_key(args, kwargs): + key = args + if kwargs: + for item in kwargs.items(): + key += item + return hash(key) + + @staticmethod + def cached(function): + """ + Decorate a function to make it use a results cache based on passed arguments. + + Note: all arguments must be hashable for this function to work properly. 
+ """ + cache = {} + Cache._caches.append(cache) + + @functools.wraps(function) + def wrapper(*args, **kwargs): + cache_key = Cache._make_key(args, kwargs) - # Until maximumNetworkCards is not available, 1 is a safe value for all instance types - needed_interfaces = int(instance_info.get("NetworkInfo").get("MaximumNetworkCards", 1)) + if Cache.is_enabled() and cache_key in cache: + return cache[cache_key] + else: + return_value = function(*args, **kwargs) + if Cache.is_enabled(): + cache[cache_key] = return_value + return return_value - return needed_interfaces + return wrapper -def get_instance_gpus(instance_type, instance_info=None): - """Return the number of GPUs provided by the instance type.""" - if not instance_info: - instance_info = get_instance_type(instance_type) +class InstanceTypeInfo: + """Data object wrapping the result of a describe_instance_types call.""" + + def __init__(self, instance_type_data): + self.instance_type_data = instance_type_data + + @staticmethod + @Cache.cached + def init_from_instance_type(instance_type, exit_on_error=True): + """ + Init InstanceTypeInfo by performing a describe_instance_types call. + + Multiple calls for the same instance_type are cached. + The function exits with error if exit_on_error is set to True. + """ + try: + ec2_client = boto3.client("ec2") + return InstanceTypeInfo( + ec2_client.describe_instance_types(InstanceTypes=[instance_type]).get("InstanceTypes")[0] + ) + except ClientError as e: + error( + "Failed when retrieving instance type data for instance {0}: {1}".format( + instance_type, e.response.get("Error").get("Message") + ), + exit_on_error, + ) + + def gpu_count(self): + """Return the number of GPUs for the instance.""" + gpu_info = self.instance_type_data.get("GpuInfo", None) + # Currently adding up all gpus. To be reviewed if the case of heterogeneous GPUs arises. + gpus = sum([gpus.get("Count") for gpus in gpu_info.get("Gpus")]) if gpu_info else 0 + return gpus + + def max_network_interface_count(self): + """Max number of NICs for the instance.""" + needed_interfaces = int(self.instance_type_data.get("NetworkInfo").get("MaximumNetworkCards", 1)) + return needed_interfaces + + def default_threads_per_core(self): + """Return the default threads per core for the given instance type.""" + # NOTE: currently, .metal instances do not contain the DefaultThreadsPerCore + # attribute in their VCpuInfo section. This is a known issue with the + # ec2 DescribeInstanceTypes API. For these instance types an assumption + # is made that if the instance's supported architectures list includes + # x86_64 then the default is 2, otherwise it's 1. + threads_per_core = self.instance_type_data.get("VCpuInfo", {}).get("DefaultThreadsPerCore") + if threads_per_core is None: + supported_architectures = self.instance_type_data.get("ProcessorInfo", {}).get("SupportedArchitectures", []) + threads_per_core = 2 if "x86_64" in supported_architectures else 1 + return threads_per_core + + def vcpus_count(self): + """Get number of vcpus for the given instance type.""" + try: + vcpus_info = self.instance_type_data.get("VCpuInfo") + vcpus = vcpus_info.get("DefaultVCpus") + except ClientError: + vcpus = -1 - gpu_info = instance_info.get("GpuInfo", None) + return vcpus - # Currently adding up all gpus. To be reviewed if the case of heterogeneous GPUs arises. 
- gpus = sum([gpus.get("Count") for gpus in gpu_info.get("Gpus")]) if gpu_info else 0 + def supported_architecture(self): + """Return the list of supported architectures.""" + return self.instance_type_data.get("ProcessorInfo").get("SupportedArchitectures") - return gpus + def is_efa_supported(self): + """Check whether EFA is supported.""" + return self.instance_type_data.get("NetworkInfo").get("EfaSupported") diff --git a/cli/tests/pcluster/config/test_config_patch.py b/cli/tests/pcluster/config/test_config_patch.py index 3100dd83d8..fdefcfed74 100644 --- a/cli/tests/pcluster/config/test_config_patch.py +++ b/cli/tests/pcluster/config/test_config_patch.py @@ -16,6 +16,7 @@ from pcluster.config.config_patch import Change, ConfigPatch from pcluster.config.pcluster_config import PclusterConfig from pcluster.config.update_policy import UpdatePolicy +from pcluster.utils import InstanceTypeInfo from tests.pcluster.config.utils import duplicate_config_file default_cluster_params = { @@ -35,7 +36,16 @@ def _do_mocking_for_tests(mocker): mocker.patch( "pcluster.config.cfn_param_types.get_supported_architectures_for_instance_type", return_value=["x86_64"] ) - mocker.patch("pcluster.config.cfn_param_types.get_instance_network_interfaces", return_value=1) + mocker.patch( + "pcluster.config.cfn_param_types.InstanceTypeInfo.init_from_instance_type", + return_value=InstanceTypeInfo( + { + "InstanceType": "g4dn.metal", + "VCpuInfo": {"DefaultVCpus": 96, "DefaultCores": 48, "DefaultThreadsPerCore": 2}, + "NetworkInfo": {"EfaSupported": True, "MaximumNetworkCards": 1}, + } + ), + ) def _check_patch(src_conf, dst_conf, expected_changes, expected_patch_policy): diff --git a/cli/tests/pcluster/config/test_hit_converter.py b/cli/tests/pcluster/config/test_hit_converter.py index f034553b06..971fbc7606 100644 --- a/cli/tests/pcluster/config/test_hit_converter.py +++ b/cli/tests/pcluster/config/test_hit_converter.py @@ -14,8 +14,7 @@ from pcluster.cluster_model import ClusterModel from pcluster.config.hit_converter import HitConverter -from pcluster.utils import is_hit_enabled_scheduler -from tests.common import MockedBoto3Request +from pcluster.utils import InstanceTypeInfo from tests.pcluster.config.utils import init_pcluster_config_from_configparser @@ -170,25 +169,16 @@ def test_hit_converter(mocker, boto3_stubber, src_config_dict, dst_config_dict): scheduler = src_config_dict["cluster default"]["scheduler"] instance_type = src_config_dict["cluster default"]["compute_instance_type"] - mocker.patch("pcluster.config.cfn_param_types.get_instance_network_interfaces", return_value=1) - - if is_hit_enabled_scheduler(scheduler): - mocked_requests = [ - MockedBoto3Request( - method="describe_instance_types", - response={ - "InstanceTypes": [ - { - "InstanceType": instance_type, - "VCpuInfo": {"DefaultVCpus": 96, "DefaultCores": 48, "DefaultThreadsPerCore": 2}, - "NetworkInfo": {"EfaSupported": True}, - } - ] - }, - expected_params={"InstanceTypes": [instance_type]}, - ) - ] - boto3_stubber("ec2", mocked_requests) + mocker.patch( + "pcluster.config.cfn_param_types.InstanceTypeInfo.init_from_instance_type", + return_value=InstanceTypeInfo( + { + "InstanceType": instance_type, + "VCpuInfo": {"DefaultVCpus": 96, "DefaultCores": 48, "DefaultThreadsPerCore": 2}, + "NetworkInfo": {"EfaSupported": True, "MaximumNetworkCards": 1}, + } + ), + ) config_parser = configparser.ConfigParser() diff --git a/cli/tests/pcluster/config/test_json_param_types.py b/cli/tests/pcluster/config/test_json_param_types.py index 
705f5db600..efa90eb1cd 100644 --- a/cli/tests/pcluster/config/test_json_param_types.py +++ b/cli/tests/pcluster/config/test_json_param_types.py @@ -114,6 +114,13 @@ } +@pytest.fixture(autouse=True) +def clear_cache(): + from pcluster.utils import Cache + + Cache.clear_all() + + @pytest.fixture() def boto3_stubber_path(): return "pcluster.utils.boto3" @@ -241,26 +248,23 @@ def _mock_boto3(boto3_stubber, expected_json_params, master_instance_type=None): """Mock the boto3 client based on the expected json configuration.""" expected_json_queue_settings = expected_json_params["cluster"].get("queue_settings", {}) mocked_requests = [] - + instance_types = [] # One describe_instance_type for the Master node if master_instance_type: - mocked_requests.append( - MockedBoto3Request( - method="describe_instance_types", - response=DESCRIBE_INSTANCE_TYPES_RESPONSES[master_instance_type], - expected_params={"InstanceTypes": [master_instance_type]}, - ) - ) + instance_types.append(master_instance_type) # One describe_instance_type per compute resource for _, queue in expected_json_queue_settings.items(): for _, compute_resource in queue.get("compute_resource_settings", {}).items(): - instance_type = compute_resource["instance_type"] - mocked_requests.append( - MockedBoto3Request( - method="describe_instance_types", - response=DESCRIBE_INSTANCE_TYPES_RESPONSES[instance_type], - expected_params={"InstanceTypes": [instance_type]}, - ) + if compute_resource["instance_type"] not in instance_types: + instance_types.append(compute_resource["instance_type"]) + + for instance_type in instance_types: + mocked_requests.append( + MockedBoto3Request( + method="describe_instance_types", + response=DESCRIBE_INSTANCE_TYPES_RESPONSES[instance_type], + expected_params={"InstanceTypes": [instance_type]}, ) + ) boto3_stubber("ec2", mocked_requests) diff --git a/cli/tests/pcluster/config/test_runtime.py b/cli/tests/pcluster/config/test_runtime.py index e2f9450aa9..ad28cd606d 100644 --- a/cli/tests/pcluster/config/test_runtime.py +++ b/cli/tests/pcluster/config/test_runtime.py @@ -20,7 +20,11 @@ def test_update_sections(mocker, pcluster_config_reader): mocker.patch( "pcluster.config.cfn_param_types.get_supported_architectures_for_instance_type", return_value=["x86_64"] ) - mocker.patch("pcluster.config.cfn_param_types.get_instance_network_interfaces", return_value=1) + instance_type_info_mock = mocker.MagicMock() + mocker.patch( + "pcluster.config.cfn_param_types.InstanceTypeInfo.init_from_instance_type", return_value=instance_type_info_mock + ) + instance_type_info_mock.max_network_interface_count.return_value = 1 pcluster_config = PclusterConfig( cluster_label="default", config_file=pcluster_config_reader(), fail_on_file_absence=True, fail_on_error=True ) diff --git a/cli/tests/pcluster/config/test_section_cluster.py b/cli/tests/pcluster/config/test_section_cluster.py index 4ce1257ee7..2ccde19064 100644 --- a/cli/tests/pcluster/config/test_section_cluster.py +++ b/cli/tests/pcluster/config/test_section_cluster.py @@ -804,8 +804,12 @@ def test_cluster_section_to_cfn( utils.set_default_values_for_required_cluster_section_params(section_dict) utils.mock_pcluster_config(mocker) mocker.patch("pcluster.config.cfn_param_types.get_efs_mount_target_id", return_value="valid_mount_target_id") - mocker.patch("pcluster.config.cfn_param_types.get_instance_vcpus", return_value=4) - mocker.patch("pcluster.config.cfn_param_types.get_default_threads_per_core", side_effect=default_threads_per_core) + instance_type_info_mock = mocker.MagicMock() 
+ mocker.patch( + "pcluster.config.cfn_param_types.InstanceTypeInfo.init_from_instance_type", return_value=instance_type_info_mock + ) + instance_type_info_mock.vcpus_count.return_value = 4 + instance_type_info_mock.default_threads_per_core.side_effect = default_threads_per_core utils.assert_section_to_cfn(mocker, cluster_section_definition, section_dict, expected_cfn_params) @@ -1210,7 +1214,7 @@ def test_sit_cluster_from_file_to_cfn(mocker, pcluster_config_reader, settings_l side_effect=lambda subnet: "mocked_avail_zone" if subnet == "subnet-12345678" else "some_other_az", ) - mocker.patch("pcluster.config.cfn_param_types.get_instance_vcpus", return_value=2) + mocker.patch("pcluster.config.cfn_param_types.InstanceTypeInfo.vcpus_count", return_value=2) utils.assert_section_params(mocker, pcluster_config_reader, settings_label, expected_cfn_params) diff --git a/cli/tests/pcluster/config/test_source_consistency.py b/cli/tests/pcluster/config/test_source_consistency.py index 131dec3aa9..7ea08d5aa3 100644 --- a/cli/tests/pcluster/config/test_source_consistency.py +++ b/cli/tests/pcluster/config/test_source_consistency.py @@ -16,7 +16,6 @@ import tests.pcluster.config.utils as utils from pcluster.config.mappings import ALIASES, AWS, CLUSTER_SIT, CW_LOG, DCV, EBS, EFS, FSX, GLOBAL, RAID, SCALING, VPC -from pcluster.config.pcluster_config import PclusterConfig from tests.pcluster.config.defaults import CFN_CLI_RESERVED_PARAMS, CFN_SIT_CONFIG_NUM_OF_PARAMS, DefaultCfnParams EXISTING_SECTIONS = [ALIASES, AWS, CLUSTER_SIT, CW_LOG, DCV, EBS, EFS, FSX, GLOBAL, RAID, SCALING, VPC] @@ -68,23 +67,6 @@ def test_mapping_consistency(): ).is_not_none() -def test_example_config_consistency(mocker): - """Validate example file and try to convert to CFN.""" - mocker.patch("pcluster.config.cfn_param_types.get_availability_zone_of_subnet", return_value="mocked_avail_zone") - mocker.patch( - "pcluster.config.cfn_param_types.get_supported_architectures_for_instance_type", return_value=["x86_64"] - ) - mocker.patch("pcluster.config.cfn_param_types.get_instance_network_interfaces", return_value=1) - pcluster_config = PclusterConfig(config_file=utils.get_pcluster_config_example(), fail_on_file_absence=True) - - cfn_params = pcluster_config.to_cfn() - - assert_that(len(cfn_params)).is_equal_to(utils.get_cfn_config_num_of_params(pcluster_config)) - - # for param_key, param_value in expected_cfn_params.items(): - # assert_that(cfn_params.get(param_key)).is_equal_to(expected_cfn_params.get(param_key)) - - def test_defaults_consistency(): """Verifies that the defaults values for the CFN parameters used in the tests are the same in the CFN template.""" template_num_of_params = _get_pcluster_cfn_num_of_params() diff --git a/cli/tests/pcluster/config/test_utils.py b/cli/tests/pcluster/config/test_utils.py deleted file mode 100644 index 4fe7e0bf83..0000000000 --- a/cli/tests/pcluster/config/test_utils.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance -# with the License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. 
-import pytest -from assertpy import assert_that - -from pcluster.utils import get_instance_vcpus -from tests.common import MockedBoto3Request - - -@pytest.fixture() -def boto3_stubber_path(): - return "pcluster.utils.boto3" - - -@pytest.mark.parametrize("valid_instance_type, expected_vcpus", [(True, 96), (False, -1)]) -def test_get_instance_vcpus(boto3_stubber, valid_instance_type, expected_vcpus): - instance_type = "g4dn.metal" - mocked_requests = [ - MockedBoto3Request( - method="describe_instance_types", - response={"InstanceTypes": [{"InstanceType": "g4dn.metal", "VCpuInfo": {"DefaultVCpus": 96}}]}, - expected_params={"InstanceTypes": [instance_type]}, - generate_error=not valid_instance_type, - ) - ] - - boto3_stubber("ec2", mocked_requests) - assert_that(get_instance_vcpus(instance_type)).is_equal_to(expected_vcpus) diff --git a/cli/tests/pcluster/config/test_validators.py b/cli/tests/pcluster/config/test_validators.py index 68d133a6ab..f4cd45d1fd 100644 --- a/cli/tests/pcluster/config/test_validators.py +++ b/cli/tests/pcluster/config/test_validators.py @@ -113,7 +113,9 @@ def test_head_node_instance_type_validator(mocker, instance_type, expected_messa def test_compute_instance_type_validator(mocker, scheduler, instance_type, expected_message, expected_warnings): config_parser_dict = {"cluster default": {"scheduler": scheduler, "compute_instance_type": instance_type}} extra_patches = { - "pcluster.config.validators.get_instance_network_interfaces": 4 if instance_type == "p4d.24xlarge" else 1, + "pcluster.config.validators.InstanceTypeInfo.max_network_interface_count": 4 + if instance_type == "p4d.24xlarge" + else 1, } utils.assert_param_validator( mocker, config_parser_dict, expected_message, expected_warnings, extra_patches=extra_patches @@ -2010,7 +2012,11 @@ def test_compute_resource_validator(mocker, section_dict, expected_message): mocker.patch( "pcluster.config.cfn_param_types.get_supported_architectures_for_instance_type", return_value=["x86_64"] ) - mocker.patch("pcluster.config.cfn_param_types.get_instance_network_interfaces", return_value=1) + instance_type_info_mock = mocker.MagicMock() + mocker.patch( + "pcluster.config.cfn_param_types.InstanceTypeInfo.init_from_instance_type", return_value=instance_type_info_mock + ) + instance_type_info_mock.max_network_interface_count.return_value = 1 mocker.patch("pcluster.config.validators.get_supported_architectures_for_instance_type", return_value=["x86_64"]) pcluster_config = utils.init_pcluster_config_from_configparser(config_parser, False) diff --git a/cli/tests/pcluster/config/utils.py b/cli/tests/pcluster/config/utils.py index fe654cc7aa..9b3fbd9ee9 100644 --- a/cli/tests/pcluster/config/utils.py +++ b/cli/tests/pcluster/config/utils.py @@ -22,6 +22,7 @@ from pcluster.config.cfn_param_types import CfnParam from pcluster.config.param_types import StorageData from pcluster.config.pcluster_config import PclusterConfig +from pcluster.utils import InstanceTypeInfo from tests.pcluster.config.defaults import CFN_HIT_CONFIG_NUM_OF_PARAMS, CFN_SIT_CONFIG_NUM_OF_PARAMS, DefaultDict # List of parameters ignored by default when comparing sections @@ -110,8 +111,12 @@ def get_mock_pcluster_config_patches(scheduler, extra_patches=None): "pcluster.config.validators.get_supported_architectures_for_instance_type": architectures, "pcluster.config.cfn_param_types.get_availability_zone_of_subnet": "mocked_avail_zone", "pcluster.config.cfn_param_types.get_supported_architectures_for_instance_type": architectures, - 
"pcluster.config.validators.get_instance_vcpus": 1, - "pcluster.config.cfn_param_types.get_instance_network_interfaces": 1, + "pcluster.config.cfn_param_types.InstanceTypeInfo.init_from_instance_type": InstanceTypeInfo( + { + "VCpuInfo": {"DefaultVCpus": 96, "DefaultCores": 48, "DefaultThreadsPerCore": 2}, + "NetworkInfo": {"EfaSupported": True, "MaximumNetworkCards": 1}, + } + ), } if extra_patches: patches = merge_dicts(patches, extra_patches) @@ -126,14 +131,16 @@ def mock_pcluster_config(mocker, scheduler=None, extra_patches=None, patch_funcs mocker.patch.object(PclusterConfig, "_PclusterConfig__test_configuration") -def mock_get_instance_type(mocker, instance_type="t2.micro"): +def mock_instance_type_info(mocker, instance_type="t2.micro"): mocker.patch( - "pcluster.utils.get_instance_type", - return_value={ - "InstanceType": instance_type, - "VCpuInfo": {"DefaultVCpus": 4, "DefaultCores": 2}, - "NetworkInfo": {"EfaSupported": False}, - }, + "pcluster.utils.InstanceTypeInfo.init_from_instance_type", + return_value=InstanceTypeInfo( + { + "InstanceType": instance_type, + "VCpuInfo": {"DefaultVCpus": 4, "DefaultCores": 2}, + "NetworkInfo": {"EfaSupported": False}, + } + ), ) @@ -174,7 +181,7 @@ def assert_param_validator( config_parser.read_dict(config_parser_dict) mock_pcluster_config(mocker, config_parser_dict.get("cluster default").get("scheduler"), extra_patches) - mock_get_instance_type(mocker) + mock_instance_type_info(mocker) if expected_error: with pytest.raises(SystemExit, match=expected_error): @@ -355,12 +362,14 @@ def assert_section_params(mocker, pcluster_config_reader, settings_label, expect "pcluster.config.cfn_param_types.get_supported_architectures_for_instance_type", return_value=["x86_64"] ) mocker.patch( - "pcluster.utils.get_instance_type", - return_value={ - "InstanceType": "t2.micro", - "VCpuInfo": {"DefaultVCpus": 1, "DefaultCores": 1, "DefaultThreadsPerCore": 1}, - "NetworkInfo": {"EfaSupported": False}, - }, + "pcluster.utils.InstanceTypeInfo.init_from_instance_type", + return_value=InstanceTypeInfo( + { + "InstanceType": "t2.micro", + "VCpuInfo": {"DefaultVCpus": 1, "DefaultCores": 1, "DefaultThreadsPerCore": 1}, + "NetworkInfo": {"EfaSupported": False}, + } + ), ) if isinstance(expected_cfn_params, SystemExit): with pytest.raises(SystemExit): diff --git a/cli/tests/pcluster/configure/test_pcluster_configure.py b/cli/tests/pcluster/configure/test_pcluster_configure.py index 3b2d26a977..367f4fd2ce 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure.py +++ b/cli/tests/pcluster/configure/test_pcluster_configure.py @@ -8,7 +8,7 @@ from pcluster.configure.easyconfig import configure from pcluster.configure.networking import NetworkConfiguration -from tests.pcluster.config.utils import mock_get_instance_type +from tests.pcluster.config.utils import mock_instance_type_info EASYCONFIG = "pcluster.configure.easyconfig." NETWORKING = "pcluster.configure.networking." 
@@ -303,7 +303,7 @@ def _mock_parallel_cluster_config(mocker): ) for instance_type in supported_instance_types: - mock_get_instance_type(mocker, instance_type) + mock_instance_type_info(mocker, instance_type) def _run_configuration(mocker, path, with_config=False, region=None): diff --git a/cli/tests/pcluster/createami/test_pcluster_createami.py b/cli/tests/pcluster/createami/test_pcluster_createami.py index 55bdbf9cac..2f83ea7443 100644 --- a/cli/tests/pcluster/createami/test_pcluster_createami.py +++ b/cli/tests/pcluster/createami/test_pcluster_createami.py @@ -30,7 +30,7 @@ def test_get_default_createami_instance_type( ): """Verify that the function to select default instance types for the createami command behaves as expected.""" instance_type_info_patch = mocker.patch( - "pcluster.createami.utils.get_instance_types_info", + "pcluster.createami.utils.InstanceTypeInfo.init_from_instance_type", side_effect=SystemExit(instance_info_err) if instance_info_err else None, ) logger_error_patch = mocker.patch("pcluster.createami.LOGGER.error") @@ -56,7 +56,7 @@ def test_get_default_createami_instance_type( with pytest.raises(SystemExit) as sysexit: createami._get_default_createami_instance_type(ami_architecture) assert_that(sysexit.value.code).is_not_equal_to(0) - instance_type_info_patch.assert_called_with([expected_default_instance_type], fail_on_error=True) + instance_type_info_patch.assert_called_with(expected_default_instance_type) if instance_unavailable_in_region: assert_that(logger_error_patch.call_count).is_equal_to(1) @@ -65,7 +65,7 @@ def test_get_default_createami_instance_type( assert_that(createami._get_default_createami_instance_type(ami_architecture)).is_equal_to( expected_default_instance_type ) - instance_type_info_patch.assert_called_with([expected_default_instance_type], fail_on_error=True) + instance_type_info_patch.assert_called_with(expected_default_instance_type) @pytest.mark.parametrize( diff --git a/cli/tests/pcluster/utils/test_pcluster_utils.py b/cli/tests/pcluster/test_utils.py similarity index 87% rename from cli/tests/pcluster/utils/test_pcluster_utils.py rename to cli/tests/pcluster/test_utils.py index d1d879c016..ac6d1191a4 100644 --- a/cli/tests/pcluster/utils/test_pcluster_utils.py +++ b/cli/tests/pcluster/test_utils.py @@ -11,7 +11,7 @@ from botocore.exceptions import ClientError, EndpointConnectionError import pcluster.utils as utils -from pcluster.utils import get_bucket_url +from pcluster.utils import Cache, get_bucket_url from tests.common import MockedBoto3Request FAKE_CLUSTER_NAME = "cluster_name" @@ -499,47 +499,6 @@ def test_get_info_for_amis(boto3_stubber, image_ids, response, error_message): assert_that(sysexit.value.code).is_not_equal_to(0) -@pytest.mark.parametrize( - "instance_types, error_message, fail_on_error", - [ - # Test when calling for single instance types - (["t2.micro"], None, None), - (["bad.instance.type"], "some error message", True), - (["bad.instance.type"], "some error message", False), - # Test when calling for multiple instance types - (["t2.micro", "t2.xlarge"], None, None), - (["a1.medium", "m6g.xlarge"], None, None), - (["bad.instance.type1", "bad.instance.type2"], "some error message", True), - (["bad.instance.type1", "bad.instance.type2"], "some error message", False), - ], -) -def test_get_instance_types_info(boto3_stubber, capsys, instance_types, error_message, fail_on_error): - """Verify that get_instance_types_info makes the expected API call.""" - response_dict = {"InstanceTypes": [{"InstanceType": instance_type} for 
instance_type in instance_types]} - mocked_requests = [ - MockedBoto3Request( - method="describe_instance_types", - response=response_dict if error_message is None else error_message, - expected_params={"InstanceTypes": instance_types}, - generate_error=error_message, - ) - ] - boto3_stubber("ec2", mocked_requests) - if error_message and fail_on_error: - full_error_message = "calling DescribeInstanceTypes for instances {0}: {1}".format( - ", ".join(instance_types), error_message - ) - with pytest.raises(SystemExit, match=full_error_message) as sysexit: - utils.get_instance_types_info(instance_types, fail_on_error) - assert_that(sysexit.value.code).is_not_equal_to(0) - elif error_message: - utils.get_instance_types_info(instance_types, fail_on_error) - assert_that(capsys.readouterr().out).matches(error_message) - else: - instance_types_info = utils.get_instance_types_info(instance_types, fail_on_error) - assert_that(instance_types_info).is_equal_to(response_dict.get("InstanceTypes")) - - @pytest.mark.parametrize( "instance_type, supported_architectures, error_message", [ @@ -552,8 +511,8 @@ def test_get_instance_types_info(boto3_stubber, capsys, instance_types, error_me def test_get_supported_architectures_for_instance_type(mocker, instance_type, supported_architectures, error_message): """Verify that get_supported_architectures_for_instance_type behaves as expected for various cases.""" get_instance_types_info_patch = mocker.patch( - "pcluster.utils.get_instance_types_info", - return_value=[{"ProcessorInfo": {"SupportedArchitectures": supported_architectures}}], + "pcluster.utils.InstanceTypeInfo.init_from_instance_type", + return_value=utils.InstanceTypeInfo({"ProcessorInfo": {"SupportedArchitectures": supported_architectures}}), ) observed_architectures = utils.get_supported_architectures_for_instance_type(instance_type) expected_architectures = list(set(supported_architectures) & set(["x86_64", "arm64"])) @@ -562,7 +521,7 @@ def test_get_supported_architectures_for_instance_type(mocker, instance_type, su if instance_type == "optimal": get_instance_types_info_patch.assert_not_called() else: - get_instance_types_info_patch.assert_called_with([instance_type]) + get_instance_types_info_patch.assert_called_with(instance_type) @pytest.mark.parametrize( @@ -1008,3 +967,140 @@ def test_get_ebs_snapshot_info(boto3_stubber, snapshot_id, raise_exceptions, err with pytest.raises(SystemExit, match=error_message) as sysexit: utils.get_ebs_snapshot_info(snapshot_id, raise_exceptions=raise_exceptions) assert_that(sysexit.value.code).is_not_equal_to(0) + + +@pytest.mark.cache +class TestCache: + invocations = [] + + @pytest.fixture(autouse=True) + def clear_cache(self): + utils.Cache.clear_all() + + @pytest.fixture(autouse=True) + def clear_invocations(self): + del self.invocations[:] + + @pytest.fixture + def disabled_cache(self): + os.environ["PCLUSTER_CACHE_DISABLED"] = "true" + yield + del os.environ["PCLUSTER_CACHE_DISABLED"] + + @staticmethod + @Cache.cached + def _cached_method_1(arg1, arg2): + TestCache.invocations.append((arg1, arg2)) + return arg1, arg2 + + @staticmethod + @Cache.cached + def _cached_method_2(arg1, arg2): + TestCache.invocations.append((arg1, arg2)) + return arg1, arg2 + + def test_cached_method(self): + for _ in range(0, 2): + assert_that(self._cached_method_1(1, 2)).is_equal_to((1, 2)) + assert_that(self._cached_method_2(1, 2)).is_equal_to((1, 2)) + assert_that(self._cached_method_1(2, 1)).is_equal_to((2, 1)) + assert_that(self._cached_method_1(1, arg2=2)).is_equal_to((1, 
2)) + assert_that(self._cached_method_1(arg1=1, arg2=2)).is_equal_to((1, 2)) + + assert_that(self.invocations).is_length(5) + + def test_disabled_cache(self, disabled_cache): + assert_that(self._cached_method_1(1, 2)).is_equal_to((1, 2)) + assert_that(self._cached_method_1(1, 2)).is_equal_to((1, 2)) + + assert_that(self.invocations).is_length(2) + + def test_clear_all(self): + for _ in range(0, 2): + assert_that(self._cached_method_1(1, 2)).is_equal_to((1, 2)) + assert_that(self._cached_method_2(1, 2)).is_equal_to((1, 2)) + + Cache.clear_all() + + for _ in range(0, 2): + assert_that(self._cached_method_1(1, 2)).is_equal_to((1, 2)) + assert_that(self._cached_method_2(1, 2)).is_equal_to((1, 2)) + + assert_that(self.invocations).is_length(4) + + +class TestInstanceTypeInfo: + @pytest.fixture(autouse=True) + def clear_cache(self): + utils.Cache.clear_all() + + def test_init_from_instance_type(self, boto3_stubber): + mocked_requests = [ + MockedBoto3Request( + method="describe_instance_types", + response={ + "InstanceTypes": [ + { + "InstanceType": "c4.xlarge", + "VCpuInfo": {"DefaultVCpus": 4, "DefaultCores": 2, "DefaultThreadsPerCore": 2}, + "NetworkInfo": {"EfaSupported": False, "MaximumNetworkCards": 1}, + "ProcessorInfo": {"SupportedArchitectures": ["x86_64"]}, + } + ] + }, + expected_params={"InstanceTypes": ["c4.xlarge"]}, + ), + MockedBoto3Request( + method="describe_instance_types", + response={ + "InstanceTypes": [ + { + "InstanceType": "g4dn.metal", + "VCpuInfo": {"DefaultVCpus": 96}, + "GpuInfo": {"Gpus": [{"Name": "T4", "Manufacturer": "NVIDIA", "Count": 8}]}, + "NetworkInfo": {"EfaSupported": True, "MaximumNetworkCards": 4}, + "ProcessorInfo": {"SupportedArchitectures": ["x86_64"]}, + } + ] + }, + expected_params={"InstanceTypes": ["g4dn.metal"]}, + ), + ] + boto3_stubber("ec2", mocked_requests) + + for _ in range(0, 2): + c4_instance_info = utils.InstanceTypeInfo.init_from_instance_type("c4.xlarge") + g4dn_instance_info = utils.InstanceTypeInfo.init_from_instance_type("g4dn.metal") + + assert_that(c4_instance_info.gpu_count()).is_equal_to(0) + assert_that(c4_instance_info.max_network_interface_count()).is_equal_to(1) + assert_that(c4_instance_info.default_threads_per_core()).is_equal_to(2) + assert_that(c4_instance_info.vcpus_count()).is_equal_to(4) + assert_that(c4_instance_info.supported_architecture()).is_equal_to(["x86_64"]) + assert_that(c4_instance_info.is_efa_supported()).is_equal_to(False) + + assert_that(g4dn_instance_info.gpu_count()).is_equal_to(8) + assert_that(g4dn_instance_info.max_network_interface_count()).is_equal_to(4) + assert_that(g4dn_instance_info.default_threads_per_core()).is_equal_to(2) + assert_that(g4dn_instance_info.vcpus_count()).is_equal_to(96) + assert_that(g4dn_instance_info.supported_architecture()).is_equal_to(["x86_64"]) + assert_that(g4dn_instance_info.is_efa_supported()).is_equal_to(True) + + def test_init_from_instance_type_failure(self, boto3_stubber): + boto3_stubber( + "ec2", + 2 + * [ + MockedBoto3Request( + method="describe_instance_types", + expected_params={"InstanceTypes": ["g4dn.metal"]}, + generate_error=True, + response="Error message", + ) + ], + ) + error_message = "Failed when retrieving instance type data for instance g4dn.metal: Error message" + with pytest.raises(SystemExit, match=error_message): + utils.InstanceTypeInfo.init_from_instance_type("g4dn.metal") + + utils.InstanceTypeInfo.init_from_instance_type("g4dn.metal", exit_on_error=False) diff --git a/cli/tests/pcluster_config/test_pcluster_config_convert.py 
b/cli/tests/pcluster_config/test_pcluster_config_convert.py index 1b8c92e8cb..0523aba74f 100644 --- a/cli/tests/pcluster_config/test_pcluster_config_convert.py +++ b/cli/tests/pcluster_config/test_pcluster_config_convert.py @@ -57,7 +57,7 @@ def _convert_and_assert_file_content( mocker.patch("pcluster.config.cfn_param_types.get_availability_zone_of_subnet") mocker.patch("pcluster.config.cfn_param_types.get_supported_architectures_for_instance_type") - mocker.patch("pcluster.config.json_param_types.utils.get_instance_type") + mocker.patch("pcluster.config.json_param_types.utils.InstanceTypeInfo.init_from_instance_type") original_default_region = os.environ.get("AWS_DEFAULT_REGION") if original_default_region: From 61b07ded379423136f570667fe18c02d339e0281 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 24 Nov 2020 13:01:13 +0100 Subject: [PATCH 11/66] awsbatch: set head node IP in the JobDefinition environment This removes the need of calling CloudFormation API at every docker container launch. In order to do so a dependency on the head node substack has been introduced for the AWS Batch substack. This makes the cluster creation slower by around 40% when awsbatch is selected as the scheduler. Signed-off-by: Francesco De Martino --- CHANGELOG.md | 5 +++++ .../resources/batch/docker/scripts/entrypoint.sh | 13 +++---------- cloudformation/aws-parallelcluster.cfn.json | 6 ++++++ cloudformation/batch-substack.cfn.json | 16 ++++++++++++++++ 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 459b65ae25..0b9e251685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ CHANGELOG 2.10.1 ------ +**ENHANCEMENTS** + +- Remove CloudFormation DescribeStacks API call from AWS Batch Docker entrypoint. This removes the possibility of job + failures due to CloudFormation throttling. + 2.10.0 ------ diff --git a/cli/src/pcluster/resources/batch/docker/scripts/entrypoint.sh b/cli/src/pcluster/resources/batch/docker/scripts/entrypoint.sh index 110232148d..a372270253 100755 --- a/cli/src/pcluster/resources/batch/docker/scripts/entrypoint.sh +++ b/cli/src/pcluster/resources/batch/docker/scripts/entrypoint.sh @@ -9,16 +9,9 @@ echo "Starting ssh agents..." eval $(ssh-agent -s) && ssh-add ${SSHDIR}/id_rsa /usr/sbin/sshd -f /root/.ssh/sshd_config -h /root/.ssh/ssh_host_rsa_key -# get private Master IP -_master_ip="$(aws --region "${PCLUSTER_AWS_REGION}" cloudformation describe-stacks --stack-name "${PCLUSTER_STACK_NAME}" --query "Stacks[0].Outputs[?OutputKey=='MasterPrivateIP'].OutputValue" --output text)" -if [[ -z "${_master_ip}" ]]; then - echo "Error getting Master IP" - exit 1 -fi - # mount nfs echo "Mounting /home..." -/parallelcluster/bin/mount_nfs.sh "${_master_ip}" "/home" +/parallelcluster/bin/mount_nfs.sh "${PCLUSTER_MASTER_IP}" "/home" echo "Mounting shared file system..." 
ebs_shared_dirs=$(echo "${PCLUSTER_SHARED_DIRS}" | tr "," " ") @@ -27,7 +20,7 @@ for ebs_shared_dir in ${ebs_shared_dirs} do if [[ ${ebs_shared_dir} != "NONE" ]]; then # mount nfs - /parallelcluster/bin/mount_nfs.sh "${_master_ip}" "${ebs_shared_dir}" + /parallelcluster/bin/mount_nfs.sh "${PCLUSTER_MASTER_IP}" "${ebs_shared_dir}" fi done @@ -41,7 +34,7 @@ fi # mount RAID via nfs if [[ ${PCLUSTER_RAID_SHARED_DIR} != "NONE" ]]; then - /parallelcluster/bin/mount_nfs.sh "${_master_ip}" "${PCLUSTER_RAID_SHARED_DIR}" + /parallelcluster/bin/mount_nfs.sh "${PCLUSTER_MASTER_IP}" "${PCLUSTER_RAID_SHARED_DIR}" fi # create hostfile if mnp job diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 2cab11c7d7..9e26a22046 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -2345,6 +2345,12 @@ }, "Architecture": { "Ref": "Architecture" + }, + "MasterPrivateIP": { + "Fn::GetAtt": [ + "MasterServerSubstack", + "Outputs.MasterPrivateIP" + ] } }, "TemplateURL": { diff --git a/cloudformation/batch-substack.cfn.json b/cloudformation/batch-substack.cfn.json index a8a4ba1616..4be334dd4d 100644 --- a/cloudformation/batch-substack.cfn.json +++ b/cloudformation/batch-substack.cfn.json @@ -95,6 +95,10 @@ "x86_64", "arm64" ] + }, + "MasterPrivateIP": { + "Description": "Private IP of the head node", + "Type": "String" } }, "Conditions": { @@ -525,6 +529,12 @@ "Value": { "Ref": "RAIDSharedDir" } + }, + { + "Name": "PCLUSTER_MASTER_IP", + "Value": { + "Ref": "MasterPrivateIP" + } } ] } @@ -589,6 +599,12 @@ "Value": { "Ref": "RAIDSharedDir" } + }, + { + "Name": "PCLUSTER_MASTER_IP", + "Value": { + "Ref": "MasterPrivateIP" + } } ] } From 5cccb80816f04ee92960ddb8d8f840a68c72da16 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Fri, 27 Nov 2020 15:58:10 +0100 Subject: [PATCH 12/66] Fix createami test to not fail when custom_node is not specified if `custom_node` is not specified, the `env` variable was referenced before assignment. 
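For illustration only — a minimal sketch of the Python scoping pitfall behind this fix, using a hypothetical helper rather than the test's real code: a name assigned only inside a conditional branch is unbound whenever that branch is skipped, so the variable has to be pre-initialized before the `if`.

    from os import environ

    def build_packer_env(custom_node=None):
        # Pre-initialize so "env" is always bound, even when no custom node URL is given
        env = None
        if custom_node:
            env = environ.copy()
            env["PARALLELCLUSTER_NODE_URL"] = custom_node
        # None tells the caller to fall back to the default environment
        return env
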
Signed-off-by: Enrico Usai --- tests/integration-tests/tests/createami/test_createami.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration-tests/tests/createami/test_createami.py b/tests/integration-tests/tests/createami/test_createami.py index f94572169e..36178048fa 100644 --- a/tests/integration-tests/tests/createami/test_createami.py +++ b/tests/integration-tests/tests/createami/test_createami.py @@ -49,6 +49,7 @@ def test_createami(region, os, instance, request, pcluster_config_reader, vpc_st # Custom Node # inject PARALLELCLUSTER_NODE_URL into packer environment custom_node = request.config.getoption("createami_custom_node_package") + env = None if custom_node: env = environ.copy() env["PARALLELCLUSTER_NODE_URL"] = custom_node From 2cad6aa79fbaf4427077e5f45c8c3cee68aaeda5 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 24 Nov 2020 12:50:18 +0100 Subject: [PATCH 13/66] awsbatch: use amazonlinux:2 base image rather than amazonlinux:latest This makes sure we always download the latest for Amazon Linux 2 Signed-off-by: Francesco De Martino --- cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile b/cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile index 28ff1e5b6a..97b6642f47 100644 --- a/cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile +++ b/cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile @@ -1,4 +1,4 @@ -FROM amazonlinux:latest +FROM amazonlinux:2 ENV USER root From 0b677d3518b583419eb4f96e69124d02d7c64cac Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Thu, 26 Nov 2020 15:16:04 +0100 Subject: [PATCH 14/66] awsbatch: download AmazonLinux image from ECR rather than Docker Hub Signed-off-by: Francesco De Martino --- CHANGELOG.md | 7 ++ .../resources/batch/docker/buildspec.yml | 6 +- .../batch/docker/pull-alinux-image.sh | 22 +++++ .../batch/docker/upload-docker-images.sh | 17 ++-- cloudformation/batch-substack.cfn.json | 82 +++++++++++++++++++ tests/integration-tests/clusters_factory.py | 13 +++ .../tests/schedulers/test_awsbatch.py | 29 ++++++- 7 files changed, 162 insertions(+), 14 deletions(-) create mode 100755 cli/src/pcluster/resources/batch/docker/pull-alinux-image.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b9e251685..d41fbb4aca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,13 @@ CHANGELOG - Remove CloudFormation DescribeStacks API call from AWS Batch Docker entrypoint. This removes the possibility of job failures due to CloudFormation throttling. +**CHANGES** + +- Pull Amazon Linux Docker images from ECR when building docker image for `awsbatch` scheduler. This only applies to + images built for `x86` architecture. + +**BUG FIXES** + 2.10.0 ------ diff --git a/cli/src/pcluster/resources/batch/docker/buildspec.yml b/cli/src/pcluster/resources/batch/docker/buildspec.yml index 2d9c5a6bb5..02c84ef843 100644 --- a/cli/src/pcluster/resources/batch/docker/buildspec.yml +++ b/cli/src/pcluster/resources/batch/docker/buildspec.yml @@ -1,13 +1,9 @@ version: 0.2 phases: - install: - runtime-versions: - docker: 18 pre_build: commands: - - echo Logging in to Amazon ECR... 
- - $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION) + - sh ./pull-alinux-image.sh build: commands: - echo Build started on `date` diff --git a/cli/src/pcluster/resources/batch/docker/pull-alinux-image.sh b/cli/src/pcluster/resources/batch/docker/pull-alinux-image.sh new file mode 100755 index 0000000000..f07ba6a52f --- /dev/null +++ b/cli/src/pcluster/resources/batch/docker/pull-alinux-image.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euxo pipefail + +pull_docker_image_from_ecr() { + echo "Pulling amazonlinux:2 image from ECR" + aws ecr get-login-password --region "${ALINUX_ECR_REGISTRY_REGION}" | docker login --username AWS --password-stdin "${ALINUX_ECR_REGISTRY}" || return 1 + docker pull "${ALINUX_ECR_REGISTRY}/amazonlinux:2" || return 1 + docker tag "${ALINUX_ECR_REGISTRY}/amazonlinux:2" amazonlinux:2 +} + +if [ "${IMAGE}" = "alinux" ] || [ "${IMAGE}" = "alinux2" ]; then + if [ "${ARCHITECTURE}" = "x86_64" ]; then + if pull_docker_image_from_ecr; then + echo "Successfully pulled Amazon Linux image from ECR" + else + echo "Failed when pulling amazonlinux:2 image from ECR. Falling back to Docker Hub" + docker pull amazonlinux:2 + fi + else + docker pull amazonlinux:2 + fi +fi diff --git a/cli/src/pcluster/resources/batch/docker/upload-docker-images.sh b/cli/src/pcluster/resources/batch/docker/upload-docker-images.sh index 1dd138b0d5..faddcbe468 100755 --- a/cli/src/pcluster/resources/batch/docker/upload-docker-images.sh +++ b/cli/src/pcluster/resources/batch/docker/upload-docker-images.sh @@ -1,17 +1,20 @@ #!/usr/bin/env bash -set -eu +set -euxo pipefail + +DOMAIN_SUFFIX="" +if [[ ${AWS_REGION} == cn-* ]]; then + DOMAIN_SUFFIX=".cn" +fi push_docker_image() { local image=$1 echo "Uploading image ${image}" - S3_SUFFIX="" - if [[ ${AWS_REGION} == cn-* ]]; then - S3_SUFFIX=".cn" - fi - docker tag "${IMAGE_REPO_NAME}:${image}" "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${S3_SUFFIX}/${IMAGE_REPO_NAME}:${image}" - docker push "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${S3_SUFFIX}/${IMAGE_REPO_NAME}:${image}" + docker tag "${IMAGE_REPO_NAME}:${image}" "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${DOMAIN_SUFFIX}/${IMAGE_REPO_NAME}:${image}" + docker push "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${DOMAIN_SUFFIX}/${IMAGE_REPO_NAME}:${image}" } +aws ecr get-login-password --region "${AWS_REGION}" | docker login --username AWS --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${DOMAIN_SUFFIX}" + if [ -z "${IMAGE}" ]; then for file in $(find `pwd` -type f -name Dockerfile); do IMAGE_TAG=$(dirname "${file}" | xargs basename) diff --git a/cloudformation/batch-substack.cfn.json b/cloudformation/batch-substack.cfn.json index 4be334dd4d..728a38549e 100644 --- a/cloudformation/batch-substack.cfn.json +++ b/cloudformation/batch-substack.cfn.json @@ -1,4 +1,23 @@ { + "Mappings": { + "AmazonLinuxECR": { + "aws": { + "registry": "137112412989.dkr.ecr.us-east-1.amazonaws.com", + "region": "us-east-1", + "account": "137112412989" + }, + "aws-us-gov": { + "registry": "045324592363.dkr.ecr.us-gov-east-1.amazonaws.com", + "region": "us-gov-east-1", + "account": "045324592363" + }, + "aws-cn": { + "registry": "141808717104.dkr.ecr.cn-north-1.amazonaws.com.cn", + "region": "cn-north-1", + "account": "141808717104" + } + } + }, "Parameters": { "MinvCpus": { "Description": "Min vCPU's for ComputeEnvironment", @@ -655,11 +674,41 @@ "Ref": "OS" } }, + { + "Name": "ARCHITECTURE", + "Value": { + "Ref": "Architecture" + } + }, { 
"Name": "NOTIFICATION_URL", "Value": { "Ref": "DockerBuildWaitHandle" } + }, + { + "Name": "ALINUX_ECR_REGISTRY", + "Value": { + "Fn::FindInMap": [ + "AmazonLinuxECR", + { + "Ref": "AWS::Partition" + }, + "registry" + ] + } + }, + { + "Name": "ALINUX_ECR_REGISTRY_REGION", + "Value": { + "Fn::FindInMap": [ + "AmazonLinuxECR", + { + "Ref": "AWS::Partition" + }, + "region" + ] + } } ], "Image": { @@ -754,6 +803,39 @@ "Fn::Sub": "arn:${AWS::Partition}:s3:::${ResourcesS3Bucket}/${ArtifactS3RootDirectory}/*" }, "Sid": "S3GetObjectPolicy" + }, + { + "Action": [ + "ecr:BatchGetImage", + "ecr:GetDownloadUrlForLayer" + ], + "Effect": "Allow", + "Resource": { + "Fn::Sub": [ + "arn:${AWS::Partition}:ecr:${alinux_ecr_region}:${alinux_ecr_registry_account}:repository/amazonlinux", + { + "alinux_ecr_region": { + "Fn::FindInMap": [ + "AmazonLinuxECR", + { + "Ref": "AWS::Partition" + }, + "region" + ] + }, + "alinux_ecr_registry_account": { + "Fn::FindInMap": [ + "AmazonLinuxECR", + { + "Ref": "AWS::Partition" + }, + "account" + ] + } + } + ] + }, + "Sid": "AlinuxECRRepoPolicy" } ], "Version": "2012-10-17" diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index 35bc4aa8db..0015224d5c 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -35,6 +35,7 @@ def __init__(self, name, ssh_key, config_file): self.__cfn_resources = None self.__head_node_substack_cfn_resources = None self.__ebs_substack_cfn_resources = None + self.__awsbatch_substack_cfn_resources = None def __repr__(self): attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) @@ -228,6 +229,18 @@ def ebs_substack_cfn_resources(self): ) return self.__ebs_substack_cfn_resources + @property + def awsbatch_substack_cfn_resources(self): + """ + Return the CloudFormation stack resources for the cluster's EBS substack. + Resources are retrieved only once and then cached. + """ + if not self.__awsbatch_substack_cfn_resources: + self.__awsbatch_substack_cfn_resources = retrieve_cfn_resources( + self.cfn_resources.get("AWSBatchStack"), self.region + ) + return self.__awsbatch_substack_cfn_resources + def _reset_cached_properties(self): """Discard cached data.""" self.__cfn_outputs = None diff --git a/tests/integration-tests/tests/schedulers/test_awsbatch.py b/tests/integration-tests/tests/schedulers/test_awsbatch.py index 50edd288a1..da2d6e5065 100644 --- a/tests/integration-tests/tests/schedulers/test_awsbatch.py +++ b/tests/integration-tests/tests/schedulers/test_awsbatch.py @@ -9,8 +9,10 @@ # or in the "LICENSE.txt" file accompanying this file. # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. 
+import json import logging +import boto3 import pytest from assertpy import assert_that from remote_command_executor import RemoteCommandExecutor @@ -25,8 +27,8 @@ @pytest.mark.dimensions("cn-north-1", "c4.xlarge", "alinux2", "awsbatch") @pytest.mark.dimensions("ap-southeast-1", "c5.xlarge", "alinux", "awsbatch") @pytest.mark.dimensions("ap-northeast-1", "m6g.xlarge", "alinux2", "awsbatch") -@pytest.mark.usefixtures("region", "os", "instance", "scheduler") -def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir, caplog, region): +@pytest.mark.usefixtures("instance", "scheduler") +def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir, caplog, region, os, architecture): """ Test all AWS Batch related features. @@ -39,12 +41,35 @@ def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir, caplog remote_command_executor = RemoteCommandExecutor(cluster) timeout = 120 if region.startswith("cn-") else 60 # Longer timeout in china regions due to less reliable networking + _assert_successful_codebuild_build(cluster, region, os, architecture) _test_simple_job_submission(remote_command_executor, test_datadir, timeout) _test_array_submission(remote_command_executor) _test_mnp_submission(remote_command_executor, test_datadir) _test_job_kill(remote_command_executor, timeout) +def _assert_successful_codebuild_build(cluster, region, os, architecture): + logging.info("Verifying docker build completed successfully.") + codebuild_project = cluster.awsbatch_substack_cfn_resources.get("CodeBuildDockerImageBuilderProject") + codebuild_client = boto3.client("codebuild", region_name=region) + logs_client = boto3.client("logs", region_name=region) + + build_ids = codebuild_client.list_builds_for_project(projectName=codebuild_project).get("ids") + assert_that(build_ids).is_length(1) + build = codebuild_client.batch_get_builds(ids=build_ids).get("builds")[0] + assert_that(build["buildStatus"]).is_equal_to("SUCCEEDED") + + # check Amazon Linux image is pulled from ECR + if os.startswith("alinux") and architecture == "x86_64": + response = logs_client.get_log_events( + logGroupName=build["logs"]["groupName"], + logStreamName=build["logs"]["streamName"], + limit=100, + startFromHead=True, + ) + assert_that(json.dumps(response)).contains("Successfully pulled Amazon Linux image from ECR") + + def _test_simple_job_submission(remote_command_executor, test_datadir, timeout): logging.info("Testing inline submission.") _test_job_submission(remote_command_executor, f"awsbsub --vcpus 2 --memory 256 --timeout {timeout} sleep 1") From 002c54ab394efa3cdeae9b9bd7659d536410227b Mon Sep 17 00:00:00 2001 From: Hanwen <68928867+hanwen-pcluste@users.noreply.github.com> Date: Tue, 1 Dec 2020 08:45:55 -0500 Subject: [PATCH 15/66] integ-tests: test using existing EFS (efs_fs_id) (#2213) This test uses `troposphere` to create cloudformation stacks for `efs`, `mount target`, and a instance to write an empty file with random name into the efs. Then the test verifies when the existing `efs` is provided through `efs_fs_id` in `pcluster` config file, the cluster created can read the randomly named file and share files between head node and compute node. 
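For readers unfamiliar with `troposphere`, the sketch below shows the shape of the template such a test can build — an EFS file system plus a mount target in the cluster VPC. The resource titles, subnet id and security group id here are placeholders, not the values used by the actual test.

    from troposphere import Ref, Template
    from troposphere.efs import FileSystem, MountTarget

    template = Template()
    # EFS file system that pre-exists the cluster and is later referenced via efs_fs_id
    file_system = template.add_resource(FileSystem("FileSystemResource"))
    # Mount target so instances in the VPC (writer instance and cluster nodes) can mount the file system
    template.add_resource(
        MountTarget(
            "MountTargetResource",
            FileSystemId=Ref(file_system),
            SubnetId="subnet-12345678",      # placeholder subnet id
            SecurityGroups=["sg-12345678"],  # placeholder security group id
        )
    )
    print(template.to_json())  # JSON template body passed to CloudFormation
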
Signed-off-by: Hanwen --- .../configs/common/common.yaml | 10 ++ tests/integration-tests/conftest.py | 15 +- .../tests/networking/test_networking.py | 8 +- .../tests/networking/test_security_groups.py | 8 +- .../tests/storage/test_efs.py | 148 +++++++++++++++++- .../test_existing_efs/pcluster.config.ini | 33 ++++ tests/integration-tests/utils.py | 14 +- 7 files changed, 205 insertions(+), 31 deletions(-) create mode 100644 tests/integration-tests/tests/storage/test_efs/test_existing_efs/pcluster.config.ini diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index 140a2e367b..2c3196c14b 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -409,6 +409,16 @@ storage: instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: {{ common.OSS_BATCH }} schedulers: ["awsbatch"] + test_efs.py::test_existing_efs: + dimensions: + - regions: ["ap-northeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + - regions: ["ap-northeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos8"] + schedulers: ["slurm"] test_raid.py::test_raid_fault_tolerance_mode: dimensions: - regions: ["cn-northwest-1"] diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index a1070af0e7..a4f892a321 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -43,6 +43,7 @@ from utils import ( create_s3_bucket, delete_s3_bucket, + generate_stack_name, get_architecture_supported_by_instance_type, get_vpc_snakecase_value, random_alphanumeric, @@ -329,7 +330,7 @@ def test_datadir(request, datadir): @pytest.fixture() -def pcluster_config_reader(test_datadir, vpc_stacks, region, request): +def pcluster_config_reader(test_datadir, vpc_stack, region, request): """ Define a fixture to render pcluster config templates associated to the running test. 
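The generate_stack_name import added above is the shared helper that the fixtures below switch to (its implementation lands in utils.py later in this patch). Assuming random_alphanumeric() returns a 16-character lowercase alphanumeric string, it produces names like:

    # Assumes the generate_stack_name helper added to tests/integration-tests/utils.py in this patch.
    from utils import generate_stack_name

    generate_stack_name("integ-tests-vpc", "")          # e.g. "integ-tests-vpc-ab12cd34ef56gh78"
    generate_stack_name("integ-tests-vpc", "mybranch")  # e.g. "integ-tests-vpc-ab12cd34ef56gh78-mybranch"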
@@ -349,7 +350,7 @@ def _config_renderer(config_file="pcluster.config.ini", **kwargs): config_file_path = test_datadir / config_file if not os.path.isfile(config_file_path): raise FileNotFoundError(f"Cluster config file not found in the expected dir {config_file_path}") - default_values = _get_default_template_values(vpc_stacks, region, request) + default_values = _get_default_template_values(vpc_stack, request) file_loader = FileSystemLoader(str(test_datadir)) env = Environment(loader=file_loader) rendered_template = env.get_template(config_file).render(**{**kwargs, **default_values}) @@ -441,9 +442,9 @@ def _enable_sanity_check_if_unset(cluster_config): config.write(f) -def _get_default_template_values(vpc_stacks, region, request): +def _get_default_template_values(vpc_stack, request): """Build a dictionary of default values to inject in the jinja templated cluster configs.""" - default_values = get_vpc_snakecase_value(region, vpc_stacks) + default_values = get_vpc_snakecase_value(vpc_stack) default_values.update({dimension: request.node.funcargs.get(dimension) for dimension in DIMENSIONS_MARKER_ARGS}) default_values["key_name"] = request.config.getoption("key_name") @@ -635,11 +636,7 @@ def _create_vpc_stack(request, template, region, cfn_stacks_factory): stack = CfnStack(name=request.config.getoption("vpc_stack"), region=region, template=template.to_json()) else: stack = CfnStack( - name="integ-tests-vpc-{0}{1}{2}".format( - random_alphanumeric(), - "-" if request.config.getoption("stackname_suffix") else "", - request.config.getoption("stackname_suffix"), - ), + name=generate_stack_name("integ-tests-vpc", request.config.getoption("stackname_suffix")), region=region, template=template.to_json(), ) diff --git a/tests/integration-tests/tests/networking/test_networking.py b/tests/integration-tests/tests/networking/test_networking.py index 5efb20c802..227ef3dc89 100644 --- a/tests/integration-tests/tests/networking/test_networking.py +++ b/tests/integration-tests/tests/networking/test_networking.py @@ -15,7 +15,7 @@ import pytest from assertpy import assert_that from cfn_stacks_factory import CfnStack, CfnStacksFactory -from utils import random_alphanumeric +from utils import generate_stack_name @pytest.fixture() @@ -26,11 +26,7 @@ def networking_stack_factory(request): def _create_network(region, template_path, parameters): file_content = extract_template(template_path) stack = CfnStack( - name="integ-tests-networking-{0}{1}{2}".format( - random_alphanumeric(), - "-" if request.config.getoption("stackname_suffix") else "", - request.config.getoption("stackname_suffix"), - ), + name=generate_stack_name("integ-tests-networking", request.config.getoption("stackname_suffix")), region=region, template=file_content, parameters=parameters, diff --git a/tests/integration-tests/tests/networking/test_security_groups.py b/tests/integration-tests/tests/networking/test_security_groups.py index 5c7e4481cb..3bc8c9d2d3 100644 --- a/tests/integration-tests/tests/networking/test_security_groups.py +++ b/tests/integration-tests/tests/networking/test_security_groups.py @@ -17,7 +17,7 @@ from cfn_stacks_factory import CfnStack from troposphere import Ref, Template from troposphere.ec2 import SecurityGroup, SecurityGroupIngress -from utils import check_headnode_security_group, random_alphanumeric +from utils import check_headnode_security_group, generate_stack_name @pytest.mark.usefixtures("os", "scheduler", "instance") @@ -118,11 +118,7 @@ def custom_security_group(vpc_stack, region, request, 
cfn_stacks_factory): ) ) stack = CfnStack( - name="integ-tests-custom-sg-{0}{1}{2}".format( - random_alphanumeric(), - "-" if request.config.getoption("stackname_suffix") else "", - request.config.getoption("stackname_suffix"), - ), + name=generate_stack_name("integ-tests-custom-sg", request.config.getoption("stackname_suffix")), region=region, template=template.to_json(), ) diff --git a/tests/integration-tests/tests/storage/test_efs.py b/tests/integration-tests/tests/storage/test_efs.py index 57fd9e7be2..fd4a5efee8 100644 --- a/tests/integration-tests/tests/storage/test_efs.py +++ b/tests/integration-tests/tests/storage/test_efs.py @@ -14,10 +14,15 @@ import boto3 import pytest from assertpy import assert_that +from cfn_stacks_factory import CfnStack from remote_command_executor import RemoteCommandExecutor -from utils import get_vpc_snakecase_value +from troposphere import Base64, Sub, Template +from troposphere.ec2 import Instance +from troposphere.efs import FileSystem, MountTarget +from utils import generate_stack_name, get_vpc_snakecase_value, random_alphanumeric from tests.common.schedulers_common import get_scheduler_commands +from tests.common.utils import retrieve_latest_ami from tests.storage.storage_common import verify_directory_correctly_shared @@ -28,13 +33,13 @@ @pytest.mark.schedulers(["slurm", "awsbatch"]) @pytest.mark.oss(["alinux2"]) @pytest.mark.usefixtures("region", "os", "instance") -def test_efs_compute_az(region, scheduler, pcluster_config_reader, clusters_factory, vpc_stacks): +def test_efs_compute_az(region, scheduler, pcluster_config_reader, clusters_factory, vpc_stack): """ Test when compute subnet is in a different AZ from master subnet. A compute mount target should be created and the efs correctly mounted on compute. """ - _assert_subnet_az_relations(region, vpc_stacks, expected_in_same_az=False) + _assert_subnet_az_relations(region, vpc_stack, expected_in_same_az=False) mount_dir = "efs_mount_dir" cluster_config = pcluster_config_reader(mount_dir=mount_dir) cluster = clusters_factory(cluster_config) @@ -50,13 +55,13 @@ def test_efs_compute_az(region, scheduler, pcluster_config_reader, clusters_fact @pytest.mark.instances(["c4.xlarge", "c5.xlarge"]) @pytest.mark.schedulers(["slurm", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") -def test_efs_same_az(region, scheduler, pcluster_config_reader, clusters_factory, vpc_stacks): +def test_efs_same_az(region, scheduler, pcluster_config_reader, clusters_factory, vpc_stack): """ Test when compute subnet is in the same AZ as master subnet. No compute mount point needed and the efs correctly mounted on compute. """ - _assert_subnet_az_relations(region, vpc_stacks, expected_in_same_az=True) + _assert_subnet_az_relations(region, vpc_stack, expected_in_same_az=True) mount_dir = "efs_mount_dir" cluster_config = pcluster_config_reader(mount_dir=mount_dir) cluster = clusters_factory(cluster_config) @@ -68,6 +73,135 @@ def test_efs_same_az(region, scheduler, pcluster_config_reader, clusters_factory _test_efs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) +@pytest.mark.usefixtures("os", "instance") +def test_existing_efs( + region, + scheduler, + efs_stack, + pcluster_config_reader, + clusters_factory, + vpc_stack, + request, + key_name, + cfn_stacks_factory, +): + """ + Test when efs_fs_id is provided in the config file, the existing efs can be correctly mounted. 
+ + To verify the efs is the existing efs, the test expects a file with random ran inside the efs mounted + """ + file_name = _write_file_into_efs(region, vpc_stack, efs_stack, request, key_name, cfn_stacks_factory) + + _assert_subnet_az_relations(region, vpc_stack, expected_in_same_az=False) + mount_dir = "/efs_mount_dir" + cluster_config = pcluster_config_reader( + mount_dir=mount_dir, efs_fs_id=efs_stack.cfn_resources["FileSystemResource"] + ) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + # test file in efs exist + logging.info("Testing efs {0} is correctly mounted".format(mount_dir)) + result = remote_command_executor.run_remote_command("df | grep '{0}'".format(mount_dir)) + assert_that(result.stdout).contains(mount_dir) + + remote_command_executor.run_remote_command(f"cat {mount_dir}/{file_name}") + scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + _test_efs_correctly_mounted(remote_command_executor, mount_dir) + _test_efs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) + remote_command_executor.run_remote_command(f"cat {mount_dir}/{file_name}") + + +@pytest.fixture(scope="class") +def efs_stack(cfn_stacks_factory, request, region): + """EFS stack contains a single efs resource.""" + efs_template = Template() + efs_template.set_version("2010-09-09") + efs_template.set_description("EFS stack created for testing existing EFS") + efs_template.add_resource(FileSystem("FileSystemResource")) + stack = CfnStack( + name=generate_stack_name("integ-tests-efs", request.config.getoption("stackname_suffix")), + region=region, + template=efs_template.to_json(), + ) + cfn_stacks_factory.create_stack(stack) + + yield stack + + if not request.config.getoption("no_delete"): + cfn_stacks_factory.delete_stack(stack.name, region) + + +def _write_file_into_efs(region, vpc_stack, efs_stack, request, key_name, cfn_stacks_factory): + """Write file stack contains a mount target and a instance to write a empty file with random name into the efs.""" + write_file_template = Template() + write_file_template.set_version("2010-09-09") + write_file_template.set_description("Stack to write a file to the existing EFS") + default_security_group_id = ( + boto3.client("ec2", region_name=region) + .describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_stack.cfn_outputs["VpcId"]]}, + {"Name": "group-name", "Values": ["default"]}, + ] + ) + .get("SecurityGroups")[0] + .get("GroupId") + ) + write_file_template.add_resource( + MountTarget( + "MountTargetResource", + FileSystemId=efs_stack.cfn_resources["FileSystemResource"], + SubnetId=vpc_stack.cfn_outputs["PublicSubnetId"], + SecurityGroups=[default_security_group_id], + ) + ) + random_file_name = random_alphanumeric() + user_data = ( + """ + #cloud-config + package_update: true + package_upgrade: true + runcmd: + - yum install -y amazon-efs-utils + - yum install -y nfs-utils + - file_system_id_1=""" + + efs_stack.cfn_resources["FileSystemResource"] + + """ + - efs_mount_point_1=/mnt/efs/fs1 + - mkdir -p "${!efs_mount_point_1}" + - mount -t efs ${!file_system_id_1}:/ ${!efs_mount_point_1} + - touch ${!efs_mount_point_1}/""" + + random_file_name + + """ + - umount ${!efs_mount_point_1} + - opt/aws/bin/cfn-signal -e $? 
--stack ${AWS::StackName} --resource InstanceToWriteEFS --region ${AWS::Region} + """ + ) + write_file_template.add_resource( + Instance( + "InstanceToWriteEFS", + CreationPolicy={"ResourceSignal": {"Timeout": "PT10M"}}, + ImageId=retrieve_latest_ami(region, "alinux2"), + InstanceType="c5.xlarge", + SubnetId=vpc_stack.cfn_outputs["PublicSubnetId"], + UserData=Base64(Sub(user_data)), + KeyName=key_name, + DependsOn=["MountTargetResource"], + ) + ) + write_file_stack = CfnStack( + name=generate_stack_name("integ-tests-efs-write-file", request.config.getoption("stackname_suffix")), + region=region, + template=write_file_template.to_json(), + ) + cfn_stacks_factory.create_stack(write_file_stack) + + cfn_stacks_factory.delete_stack(write_file_stack.name, region) + + return random_file_name + + def _test_efs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands): logging.info("Testing efs correctly mounted on compute nodes") verify_directory_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) @@ -87,8 +221,8 @@ def _test_efs_correctly_mounted(remote_command_executor, mount_dir): ) -def _assert_subnet_az_relations(region, vpc_stacks, expected_in_same_az): - vpc = get_vpc_snakecase_value(region, vpc_stacks) +def _assert_subnet_az_relations(region, vpc_stack, expected_in_same_az): + vpc = get_vpc_snakecase_value(vpc_stack) master_subnet_id = vpc["public_subnet_id"] compute_subnet_id = vpc["private_subnet_id"] if expected_in_same_az else vpc["private_additional_cidr_subnet_id"] master_subnet_az = boto3.resource("ec2", region_name=region).Subnet(master_subnet_id).availability_zone diff --git a/tests/integration-tests/tests/storage/test_efs/test_existing_efs/pcluster.config.ini b/tests/integration-tests/tests/storage/test_efs/test_existing_efs/pcluster.config.ini new file mode 100644 index 0000000000..881335a66e --- /dev/null +++ b/tests/integration-tests/tests/storage/test_efs/test_existing_efs/pcluster.config.ini @@ -0,0 +1,33 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +{% if scheduler == "awsbatch" %} +min_vcpus = 4 +desired_vcpus = 4 +{% else %} +initial_queue_size = 1 +maintain_initial_size = true +{% endif %} +efs_settings = efs + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +# This compute subnet would be in a different AZ than master for regions defined in AVAILABILITY_ZONE_OVERRIDES +# See conftest for details +compute_subnet_id = {{ private_additional_cidr_subnet_id }} +use_public_ips = false + +[efs efs] +efs_fs_id = {{ efs_fs_id }} +shared_dir = {{ mount_dir }} diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py index d0712770c8..2cdb18196c 100644 --- a/tests/integration-tests/utils.py +++ b/tests/integration-tests/utils.py @@ -56,6 +56,15 @@ def run_command(command, capture_output=True, log_error=True, env=None, timeout= return result +def generate_stack_name(prefix, suffix): + """Generate a stack name with prefix, suffix, and a random string in the middle""" + return prefix + "-{0}{1}{2}".format( + random_alphanumeric(), + "-" if suffix else "", + suffix, + ) + + def random_alphanumeric(size=16): """Generate a random alphanumeric string.""" return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in 
range(size)) @@ -304,11 +313,10 @@ def paginate_boto3(method, **kwargs): yield result -def get_vpc_snakecase_value(region, vpc_stacks): +def get_vpc_snakecase_value(vpc_stack): """Return dict containing snakecase vpc variables.""" vpc_output_dict = {} - vpc = vpc_stacks[region] - for key, value in vpc.cfn_outputs.items(): + for key, value in vpc_stack.cfn_outputs.items(): vpc_output_dict[to_snake_case(key)] = value return vpc_output_dict From c5687ad9fabd67de500b78c0c63a17c39ba9a10d Mon Sep 17 00:00:00 2001 From: chenwany Date: Fri, 27 Nov 2020 07:53:25 -0800 Subject: [PATCH 16/66] Add io2 volume type and integration test Add support for io2 volume type for EBS section and Raid section, add integration test to test different volume types Signed-off-by: chenwany --- CHANGELOG.md | 1 + cli/src/pcluster/config/mappings.py | 5 +- cli/src/pcluster/config/validators.py | 12 -- cli/tests/pcluster/config/test_section_ebs.py | 2 + cli/tests/pcluster/config/test_validators.py | 58 ++++--- cloudformation/aws-parallelcluster.cfn.json | 8 +- cloudformation/cw-dashboard-substack.cfn.yaml | 8 +- cloudformation/ebs-substack.cfn.json | 147 ++++++++++++++---- cloudformation/raid-substack.cfn.json | 145 +++++++++++++---- .../tests/storage/test_ebs.py | 46 +++++- .../test_ebs_multiple/pcluster.config.ini | 7 +- 11 files changed, 324 insertions(+), 115 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d41fbb4aca..dbbe6324c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ CHANGELOG - Remove CloudFormation DescribeStacks API call from AWS Batch Docker entrypoint. This removes the possibility of job failures due to CloudFormation throttling. +- Add support for io2 EBS volume type. **CHANGES** diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py index 1fab9cdff8..480ed8ee88 100644 --- a/cli/src/pcluster/config/mappings.py +++ b/cli/src/pcluster/config/mappings.py @@ -95,7 +95,6 @@ maintain_initial_size_validator, queue_settings_validator, queue_validator, - raid_volume_iops_validator, s3_bucket_uri_validator, s3_bucket_validator, scheduler_validator, @@ -149,7 +148,7 @@ "snapshot_id": r"^snap-[0-9a-z]{8}$|^snap-[0-9a-z]{17}$", "subnet_id": r"^subnet-[0-9a-z]{8}$|^subnet-[0-9a-z]{17}$", "volume_id": r"^vol-[0-9a-z]{8}$|^vol-[0-9a-z]{17}$", - "volume_types": ["standard", "io1", "gp2", "st1", "sc1"], + "volume_types": ["standard", "io1", "io2", "gp2", "st1", "sc1"], "vpc_id": r"^vpc-[0-9a-z]{8}$|^vpc-[0-9a-z]{17}$", "fsx_deployment_type": ["SCRATCH_1", "SCRATCH_2", "PERSISTENT_1"], "fsx_ssd_throughput": FSX_SSD_THROUGHPUT, @@ -415,6 +414,7 @@ "type": CfnSection, "key": "raid", "default_label": "default", + "validators": [ebs_volume_type_size_validator, ebs_volume_iops_validator], "cfn_param_mapping": "RAIDOptions", # All the parameters in the section are converted into a single CFN parameter "params": OrderedDict( # Use OrderedDict because the parameters must respect the order in the CFN parameter [ @@ -448,7 +448,6 @@ ("volume_iops", { "type": IntCfnParam, "default": 100, - "validators": [raid_volume_iops_validator], "update_policy": UpdatePolicy.SUPPORTED }), ("encrypted", { diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index 6c37fd0e1a..acf90106c1 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -894,18 +894,6 @@ def efs_validator(section_key, section_label, pcluster_config): return errors, warnings -def raid_volume_iops_validator(param_key, param_value, 
pcluster_config): - errors = [] - warnings = [] - - raid_iops = float(param_value) - raid_vol_size = float(pcluster_config.get_section("raid").get_param_value("volume_size")) - if raid_iops > raid_vol_size * 50: - errors.append("IOPS to volume size ratio of {0} is too high; maximum is 50.".format(raid_iops / raid_vol_size)) - - return errors, warnings - - def scheduler_validator(param_key, param_value, pcluster_config): errors = [] warnings = [] diff --git a/cli/tests/pcluster/config/test_section_ebs.py b/cli/tests/pcluster/config/test_section_ebs.py index c6c4058dac..6bc0964839 100644 --- a/cli/tests/pcluster/config/test_section_ebs.py +++ b/cli/tests/pcluster/config/test_section_ebs.py @@ -69,6 +69,7 @@ def test_ebs_section_from_cfn(mocker, cfn_params_dict, expected_section_dict): ({"volume_type": "gp2"}, {"ebs default": {"volume_type": "gp2"}}, "No section"), # other values ({"volume_type": "io1"}, {"ebs default": {"volume_type": "io1"}}, None), + ({"volume_type": "io2"}, {"ebs default": {"volume_type": "io2"}}, None), ({"volume_size": 30}, {"ebs default": {"volume_size": "30"}}, None), ], ) @@ -155,6 +156,7 @@ def test_ebs_section_to_cfn(mocker, section_dict, expected_cfn_params): ("volume_type", None, "gp2", None), ("volume_type", "wrong_value", None, "Allowed values are"), ("volume_type", "io1", "io1", None), + ("volume_type", "io2", "io2", None), ("volume_type", "standard", "standard", None), ("volume_type", "NONE", None, "Allowed values are"), ("volume_size", None, None, None), diff --git a/cli/tests/pcluster/config/test_validators.py b/cli/tests/pcluster/config/test_validators.py index f4cd45d1fd..32b7dc81d9 100644 --- a/cli/tests/pcluster/config/test_validators.py +++ b/cli/tests/pcluster/config/test_validators.py @@ -686,15 +686,26 @@ def test_efs_validator(mocker, section_dict, expected_message): @pytest.mark.parametrize( "section_dict, expected_message", [ - # Testing iops validator - ({"volume_iops": 1, "volume_size": 1}, None), - ({"volume_iops": 51, "volume_size": 1}, "IOPS to volume size ratio of .* is too hig"), - ({"volume_iops": 1, "volume_size": 20}, None), - ({"volume_iops": 1001, "volume_size": 20}, "IOPS to volume size ratio of .* is too hig"), - # Testing shared_dir validator - ({"shared_dir": "NONE"}, "NONE cannot be used as a shared directory"), - ({"shared_dir": "/NONE"}, "/NONE cannot be used as a shared directory"), - ({"shared_dir": "/raid"}, None), + ({"volume_type": "io1", "volume_size": 20, "volume_iops": 120}, None), + ( + {"volume_type": "io1", "volume_size": 20, "volume_iops": 90}, + "IOPS rate must be between 100 and 64000 when provisioning io1 volumes.", + ), + ( + {"volume_type": "io1", "volume_size": 20, "volume_iops": 64001}, + "IOPS rate must be between 100 and 64000 when provisioning io1 volumes.", + ), + ({"volume_type": "io1", "volume_size": 20, "volume_iops": 1001}, "IOPS to volume size ratio of .* is too hig"), + ({"volume_type": "io2", "volume_size": 20, "volume_iops": 120}, None), + ( + {"volume_type": "io2", "volume_size": 20, "volume_iops": 90}, + "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", + ), + ( + {"volume_type": "io2", "volume_size": 20, "volume_iops": 64001}, + "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", + ), + ({"volume_type": "io2", "volume_size": 20, "volume_iops": 10001}, "IOPS to volume size ratio of .* is too hig"), ], ) def test_raid_validators(mocker, section_dict, expected_message): @@ -2421,10 +2432,9 @@ def test_fsx_ignored_parameters_validator(mocker, 
section_dict, expected_error): ({"volume_type": "io1", "volume_size": 15}, None), ({"volume_type": "io1", "volume_size": 3}, "The size of io1 volumes must be at least 4 GiB"), ({"volume_type": "io1", "volume_size": 16385}, "The size of io1 volumes can not exceed 16384 GiB"), - # TODO Uncomment these lines after adding support for io2 volume types - # ({"volume_type": "io2", "volume_size": 15}, None), - # ({"volume_type": "io2", "volume_size": 3}, "The size of io2 volumes must be at least 4 GiB"), - # ({"volume_type": "io2", "volume_size": 16385}, "The size of io2 volumes must be at most 16384 GiB"), + ({"volume_type": "io2", "volume_size": 15}, None), + ({"volume_type": "io2", "volume_size": 3}, "The size of io2 volumes must be at least 4 GiB"), + ({"volume_type": "io2", "volume_size": 16385}, "The size of io2 volumes can not exceed 16384 GiB"), ({"volume_type": "gp2", "volume_size": 15}, None), ({"volume_type": "gp2", "volume_size": 0}, "The size of gp2 volumes must be at least 1 GiB"), ({"volume_type": "gp2", "volume_size": 16385}, "The size of gp2 volumes can not exceed 16384 GiB"), @@ -2462,18 +2472,16 @@ def test_ebs_allowed_values_all_have_volume_size_bounds(): "IOPS rate must be between 100 and 64000 when provisioning io1 volumes.", ), ({"volume_type": "io1", "volume_size": 20, "volume_iops": 1001}, "IOPS to volume size ratio of .* is too hig"), - # TODO Uncomment these lines after adding support for io2 volume types - # ({"volume_type": "io2", "volume_size": 20, "volume_iops": 120}, None), - # ( - # {"volume_type": "io2", "volume_size": 20, "volume_iops": 90}, - # "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", - # ), - # ( - # {"volume_type": "io2", "volume_size": 20, "volume_iops": 64001}, - # "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", - # ), - # ({"volume_type": "io2", "volume_size": 20, "volume_iops": 10001}, - # "IOPS to volume size ratio of .* is too hig"), + ({"volume_type": "io2", "volume_size": 20, "volume_iops": 120}, None), + ( + {"volume_type": "io2", "volume_size": 20, "volume_iops": 90}, + "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", + ), + ( + {"volume_type": "io2", "volume_size": 20, "volume_iops": 64001}, + "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", + ), + ({"volume_type": "io2", "volume_size": 20, "volume_iops": 10001}, "IOPS to volume size ratio of .* is too hig"), ], ) def test_ebs_volume_iops_validator(mocker, section_dict, expected_message): diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 9e26a22046..b287b0315a 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -68,8 +68,8 @@ "Description": "Comma delimited list of type of volume to create either new or from snapshot", "Type": "String", "Default": "gp2,gp2,gp2,gp2,gp2", - "ConstraintDescription": "must be a supported volume type: standard, io1, gp2, st1, sc1", - "AllowedPattern": "^(NONE|standard|io1|gp2|st1|sc1)((,|, )(NONE|standard|io1|gp2|st1|sc1)){4}$" + "ConstraintDescription": "must be a supported volume type: standard, io1, io2, gp2, st1, sc1", + "AllowedPattern": "^(NONE|standard|io1|io2|gp2|st1|sc1)((,|, )(NONE|standard|io1|io2|gp2|st1|sc1)){4}$" }, "MasterSubnetId": { "Description": "ID of the Subnet you want to provision the Master server into", @@ -122,7 +122,7 @@ ] }, "VolumeIOPS": { - "Description": "Comma delimited list of number of IOPS for volume type io1. 
Not used for other volume types.", + "Description": "Comma delimited list of number of IOPS for volume type io1 and io2. Not used for other volume types.", "Type": "String", "Default": "100,100,100,100,100" }, @@ -319,7 +319,7 @@ "Description": "Comma Separated List of RAID related options, 8 parameters in total, [shared_dir,raid_type,num_of_raid_volumes,volume_type,volume_size,volume_iops,encrypted,ebs_kms_key_id]", "Type": "String", "Default": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", - "AllowedPattern": "^(NONE|.+)(,|, )(NONE|\\d)(,|, )(NONE|\\d)(,|, )(NONE|standard|io1|gp2|st1|sc1)(,|, )(NONE|\\d+)(,|, )(NONE|\\d+)(,|, )(NONE|true|false)(,|, )(NONE|.+)$" + "AllowedPattern": "^(NONE|.+)(,|, )(NONE|\\d)(,|, )(NONE|\\d)(,|, )(NONE|standard|io1|io2|gp2|st1|sc1)(,|, )(NONE|\\d+)(,|, )(NONE|\\d+)(,|, )(NONE|true|false)(,|, )(NONE|.+)$" }, "NumberOfEBSVol": { "Description": "Number of EBS Volumes the user requested, up to 5", diff --git a/cloudformation/cw-dashboard-substack.cfn.yaml b/cloudformation/cw-dashboard-substack.cfn.yaml index 155832ac48..e600e5d075 100644 --- a/cloudformation/cw-dashboard-substack.cfn.yaml +++ b/cloudformation/cw-dashboard-substack.cfn.yaml @@ -144,8 +144,8 @@ Resources: {%- endfor %} {#- Conditional EBS metrics #} - {%- set ebs_metrics_conditions = [{'metric': 'VolumeConsumedReadWriteOps', 'supported_vol_types': ["io1"], 'extra_params': ['"title":"Consumed Read/Write Ops"']}, - {'metric': 'VolumeThroughputPercentage', 'supported_vol_types': ["io1"], 'extra_params': ['"title":"Throughput Percentage"']}, + {%- set ebs_metrics_conditions = [{'metric': 'VolumeConsumedReadWriteOps', 'supported_vol_types': ["io1", "io2"], 'extra_params': ['"title":"Consumed Read/Write Ops"']}, + {'metric': 'VolumeThroughputPercentage', 'supported_vol_types': ["io1", "io2"], 'extra_params': ['"title":"Throughput Percentage"']}, {'metric': 'BurstBalance', 'supported_vol_types': ["gp2", "st1", "sc1"], 'extra_params': ['"title":"Burst Balance"']}] %} {%- for metric_condition_params in ebs_metrics_conditions %} @@ -207,8 +207,8 @@ Resources: {%- endfor %} {#- Conditional RAID metrics #} - {%- set raid_metrics_conditions_params = [{'metric': 'VolumeConsumedReadWriteOps', 'supported_vol_types': ["io1"], 'extra_params': ['"title":"Consumed Read/Write Ops"']}, - {'metric': 'VolumeThroughputPercentage', 'supported_vol_types': ["io1"], 'extra_params': ['"title":"Throughput Percentage"']}, + {%- set raid_metrics_conditions_params = [{'metric': 'VolumeConsumedReadWriteOps', 'supported_vol_types': ["io1" ,"io2"], 'extra_params': ['"title":"Consumed Read/Write Ops"']}, + {'metric': 'VolumeThroughputPercentage', 'supported_vol_types': ["io1", "io2"], 'extra_params': ['"title":"Throughput Percentage"']}, {'metric': 'BurstBalance', 'supported_vol_types': ["gp2", "st1", "sc1"], 'extra_params': ['"title":"Burst Balance"']}] %} {%- for metric_condition_params in raid_metrics_conditions_params %} diff --git a/cloudformation/ebs-substack.cfn.json b/cloudformation/ebs-substack.cfn.json index 58b66f04d5..f50663ebc0 100644 --- a/cloudformation/ebs-substack.cfn.json +++ b/cloudformation/ebs-substack.cfn.json @@ -120,16 +120,33 @@ ] }, "Vol1_UseEBSPIOPS": { - "Fn::Equals": [ + "Fn::Or": [ { - "Fn::Select": [ - "0", + "Fn::Equals": [ { - "Ref": "VolumeType" - } + "Fn::Select": [ + "0", + { + "Ref": "VolumeType" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "0", + { + "Ref": "VolumeType" + } + ] + }, + "io2" + ] + } ] }, "Vol1_UseEBSSnapshot": { @@ -258,16 +275,33 @@ ] }, 
"Vol2_UseEBSPIOPS": { - "Fn::Equals": [ + "Fn::Or": [ { - "Fn::Select": [ - "1", + "Fn::Equals": [ { - "Ref": "VolumeType" - } + "Fn::Select": [ + "1", + { + "Ref": "VolumeType" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "1", + { + "Ref": "VolumeType" + } + ] + }, + "io2" + ] + } ] }, "Vol2_UseEBSSnapshot": { @@ -396,16 +430,33 @@ ] }, "Vol3_UseEBSPIOPS": { - "Fn::Equals": [ + "Fn::Or": [ { - "Fn::Select": [ - "2", + "Fn::Equals": [ { - "Ref": "VolumeType" - } + "Fn::Select": [ + "2", + { + "Ref": "VolumeType" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "2", + { + "Ref": "VolumeType" + } + ] + }, + "io2" + ] + } ] }, "Vol3_UseEBSSnapshot": { @@ -534,16 +585,33 @@ ] }, "Vol4_UseEBSPIOPS": { - "Fn::Equals": [ + "Fn::Or": [ { - "Fn::Select": [ - "3", + "Fn::Equals": [ { - "Ref": "VolumeType" - } + "Fn::Select": [ + "3", + { + "Ref": "VolumeType" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "VolumeType" + } + ] + }, + "io2" + ] + } ] }, "Vol4_UseEBSSnapshot": { @@ -672,16 +740,33 @@ ] }, "Vol5_UseEBSPIOPS": { - "Fn::Equals": [ + "Fn::Or": [ { - "Fn::Select": [ - "4", + "Fn::Equals": [ { - "Ref": "VolumeType" - } + "Fn::Select": [ + "4", + { + "Ref": "VolumeType" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "4", + { + "Ref": "VolumeType" + } + ] + }, + "io2" + ] + } ] }, "Vol5_UseEBSSnapshot": { @@ -1072,7 +1157,7 @@ "Type": "Number" }, "VolumeIOPS": { - "Description": "Number of IOPS for volume type io1. Not used for other volume types.", + "Description": "Number of IOPS for volume type io1 and io2. Not used for other volume types.", "Type": "CommaDelimitedList" }, "VolumeSize": { diff --git a/cloudformation/raid-substack.cfn.json b/cloudformation/raid-substack.cfn.json index 006139de7f..41311f3578 100644 --- a/cloudformation/raid-substack.cfn.json +++ b/cloudformation/raid-substack.cfn.json @@ -151,16 +151,33 @@ ] }, "Vol1_UseEBSPIOPS": { - "Fn::Equals": [ + "Fn::Or": [ { - "Fn::Select": [ - "3", + "Fn::Equals": [ { - "Ref": "RAIDOptions" - } + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io2" + ] + } ] }, "Vol1_UseVolumeSize": { @@ -235,16 +252,33 @@ ] }, "Vol2_UseEBSPIOPS": { - "Fn::Equals": [ + "Fn::Or": [ { - "Fn::Select": [ - "3", + "Fn::Equals": [ { - "Ref": "RAIDOptions" - } + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io2" + ] + } ] }, "Vol2_UseVolumeSize": { @@ -319,16 +353,33 @@ ] }, "Vol3_UseEBSPIOPS": { - "Fn::Equals": [ + "Fn::Or": [ { - "Fn::Select": [ - "3", + "Fn::Equals": [ { - "Ref": "RAIDOptions" - } + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io2" + ] + } ] }, "Vol3_UseVolumeSize": { @@ -403,16 +454,33 @@ ] }, "Vol4_UseEBSPIOPS": { - "Fn::Equals": [ + "Fn::Or": [ { - "Fn::Select": [ - "3", + "Fn::Equals": [ { - "Ref": "RAIDOptions" - } + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io2" + ] + } ] }, "Vol4_UseVolumeSize": { @@ -487,16 +555,33 @@ ] }, "Vol5_UseEBSPIOPS": { - "Fn::Equals": [ 
+ "Fn::Or": [ { - "Fn::Select": [ - "3", + "Fn::Equals": [ { - "Ref": "RAIDOptions" - } + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io1" ] }, - "io1" + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "io2" + ] + } ] }, "Vol5_UseVolumeSize": { diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index f3ee4003d7..b425d23375 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -11,7 +11,9 @@ # See the License for the specific language governing permissions and limitations under the License. import logging +import boto3 import pytest +import utils from assertpy import assert_that from remote_command_executor import RemoteCommandExecutor @@ -73,19 +75,43 @@ def test_ebs_snapshot( @pytest.mark.dimensions("ca-central-1", "c5.xlarge", "alinux2", "awsbatch") @pytest.mark.dimensions("ca-central-1", "c5.xlarge", "ubuntu1804", "slurm") @pytest.mark.dimensions("eu-west-2", "c5.xlarge", "centos8", "slurm") -@pytest.mark.usefixtures("region", "os", "instance") -def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory): +@pytest.mark.usefixtures("os", "instance") +def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory, region): mount_dirs = ["/ebs_mount_dir_{0}".format(i) for i in range(0, 5)] volume_sizes = [15 + 5 * i for i in range(0, 5)] + + # for volume type sc1 and st1, the minimum volume sizes are 500G + volume_sizes[3] = 500 + volume_sizes[4] = 500 cluster_config = pcluster_config_reader(mount_dirs=mount_dirs, volume_sizes=volume_sizes) cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) for mount_dir, volume_size in zip(mount_dirs, volume_sizes): - _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size) + # for volume size equal to 500G, the filesystem size is only about 492G + # This is because the file systems use some of the total space available on a device for storing internal + # structures and data (the file system's metadata). The overhead of the XFS filesystem is around 0.5%. + # If we test with small volume size(eg: 40G), the number is not large enough to show the gap between the + # partition size and the filesystem size. For sc1 and st1, the minimum size is 500G, so there will be a size + # difference. 
+ _test_ebs_correctly_mounted( + remote_command_executor, mount_dir, volume_size if volume_size != 500 else "49[0-9]" + ) _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) + volume_ids = get_ebs_volume_ids(cluster, region) + for i in range(len(volume_ids)): + # test different volume types + volume_type = cluster.config.get("ebs ebs{0}".format(i + 1), "volume_type") + volume = describe_volume(volume_ids[i], region) + assert_that(volume[0]).is_equal_to(volume_type) + # test different iops + # only the iops of io1 and io2 can be configured by us + if volume_type == "io1" or volume_type == "io2": + volume_iops = cluster.config.get("ebs ebs{0}".format(i + 1), "volume_iops") + assert_that(volume[1]).is_equal_to(int(volume_iops)) + @pytest.mark.dimensions("cn-northwest-1", "c4.xlarge", "alinux", "slurm") @pytest.mark.usefixtures("region", "os", "instance") @@ -175,6 +201,20 @@ def _test_ebs_resize(remote_command_executor, mount_dir, volume_size): assert_that(result.stdout).matches(r"{size}G".format(size=volume_size)) +def get_ebs_volume_ids(cluster, region): + # get the list of configured ebs volume ids + # example output: ['vol-000', 'vol-001', 'vol-002'] + ebs_stack = utils.get_substacks(cluster.cfn_name, region=region, sub_stack_name="EBSCfnStack")[0] + return utils.retrieve_cfn_outputs(ebs_stack, region).get("Volumeids").split(",") + + +def describe_volume(volume_id, region): + volume = boto3.client("ec2", region_name=region).describe_volumes(VolumeIds=[volume_id]).get("Volumes")[0] + volume_type = volume.get("VolumeType") + volume_iops = volume.get("Iops") + return volume_type, volume_iops + + @pytest.fixture() def snapshots_factory(): factory = EBSSnapshotsFactory() diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini index 8abf07f99a..c2ff5023d7 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini @@ -36,21 +36,22 @@ encrypted = true [ebs ebs2] shared_dir = {{ mount_dirs[1] }} volume_size = {{ volume_sizes[1] }} -volume_iops = 125 +volume_type = gp2 encrypted = false [ebs ebs3] shared_dir = {{ mount_dirs[2] }} volume_size = {{ volume_sizes[2] }} volume_iops = 150 +volume_type = io2 [ebs ebs4] shared_dir = {{ mount_dirs[3] }} volume_size = {{ volume_sizes[3] }} +volume_type = sc1 [ebs ebs5] shared_dir = {{ mount_dirs[4] }} volume_size = {{ volume_sizes[4] }} -volume_type = io1 -volume_iops = 200 +volume_type = st1 encrypted = false From 790688d9562e00d6d5ed3801aabb62ec45d23ebc Mon Sep 17 00:00:00 2001 From: chenwany Date: Fri, 23 Oct 2020 17:01:15 -0700 Subject: [PATCH 17/66] integ-tests: test existing ebs volume Signed-off-by: chenwany --- .../configs/common/common.yaml | 6 +++ .../tests/storage/snapshots_factory.py | 26 ++++++++++- .../tests/storage/test_ebs.py | 43 +++++++++++++++++++ .../test_ebs_existing/pcluster.config.ini | 32 ++++++++++++++ 4 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 tests/integration-tests/tests/storage/test_ebs/test_ebs_existing/pcluster.config.ini diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index 2c3196c14b..f51be906e7 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -479,6 +479,12 @@ storage: 
instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["ubuntu1804"] schedulers: ["slurm"] + test_ebs.py::test_ebs_existing: + dimensions: + - regions: ["ap-northeast-2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["sge"] tags: test_tag_propagation.py::test_tag_propagation: dimensions: diff --git a/tests/integration-tests/tests/storage/snapshots_factory.py b/tests/integration-tests/tests/storage/snapshots_factory.py index 3d43c6524d..6e9017a986 100644 --- a/tests/integration-tests/tests/storage/snapshots_factory.py +++ b/tests/integration-tests/tests/storage/snapshots_factory.py @@ -56,7 +56,29 @@ def create_snapshot(self, request, subnet_id, region): self.snapshot = self._create_snapshot(region, snapshot_config) return self.snapshot.id - def _create_snapshot(self, region, snapshot_config): + def create_existing_volume(self, request, subnet_id, region): + """ + Create a volume in a given region. + :param request: The current request + :param subnet_id: The subnet id where to get the snapshot + :param region: The region where to get the snapshot + """ + # Only one volume creation per factory allowed + if self.volume: + raise Exception("Volume already created") + + self.ec2 = boto3.resource("ec2", region_name=region) + self.boto_client = boto3.client("ec2", region_name=region) + volume_config = SnapshotConfig( + request.config.getoption("key_path"), + request.config.getoption("key_name"), + self.ec2.Subnet(subnet_id).vpc_id, + subnet_id, + ) + self._create_volume_process(region, volume_config) + return self.volume.id + + def _create_volume_process(self, region, snapshot_config): self.config = snapshot_config ami_id = self._get_amazonlinux_ami() @@ -77,6 +99,8 @@ def _create_snapshot(self, region, snapshot_config): # Stops the instance before taking the snapshot self._release_instance() + def _create_snapshot(self, region, snapshot_config): + self._create_volume_process(region, snapshot_config) self.snapshot = self._create_volume_snapshot() return self.snapshot diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index b425d23375..495baaf2b5 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -139,6 +139,42 @@ def test_ebs_single_empty(scheduler, pcluster_config_reader, clusters_factory): _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) +@pytest.mark.dimensions("ap-northeast-2", "c5.xlarge", "centos7", "sge") +@pytest.mark.usefixtures("os", "instance") +def test_ebs_existing( + request, vpc_stacks, region, scheduler, pcluster_config_reader, clusters_factory, snapshots_factory +): + logging.info("Testing ebs existing") + existing_mount_dir = "existing_mount_dir" + + logging.info("Creating volume") + + volume_id = snapshots_factory.create_existing_volume( + request, vpc_stacks[region].cfn_outputs["PublicSubnetId"], region + ) + + logging.info("Existing Volume id: %s" % volume_id) + cluster_config = pcluster_config_reader( + volume_id=volume_id, + existing_mount_dir=existing_mount_dir, + ) + + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + existing_mount_dir = "/" + existing_mount_dir + _test_ebs_correctly_mounted(remote_command_executor, existing_mount_dir, volume_size="9.8") + _test_ebs_correctly_shared(remote_command_executor, existing_mount_dir, 
scheduler_commands) + # Checks for test data + result = remote_command_executor.run_remote_command("cat {}/test.txt".format(existing_mount_dir)) + assert_that(result.stdout.strip()).is_equal_to("hello world") + + # delete the cluster before detaching the EBS volume + cluster.delete() + # check the volume still exists after deleting the cluster + _assert_volume_exist(volume_id, region) + + def _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size): logging.info("Testing ebs {0} is correctly mounted".format(mount_dir)) result = remote_command_executor.run_remote_command( @@ -215,6 +251,13 @@ def describe_volume(volume_id, region): return volume_type, volume_iops +def _assert_volume_exist(volume_id, region): + volume_status = ( + boto3.client("ec2", region_name=region).describe_volumes(VolumeIds=[volume_id]).get("Volumes")[0].get("State") + ) + assert_that(volume_status).is_equal_to("available") + + @pytest.fixture() def snapshots_factory(): factory = EBSSnapshotsFactory() diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_existing/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_ebs_existing/pcluster.config.ini new file mode 100644 index 0000000000..c579ccfcba --- /dev/null +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_existing/pcluster.config.ini @@ -0,0 +1,32 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +{% if scheduler == "awsbatch" %} +min_vcpus = 1 +desired_vcpus = 1 +{% else %} +initial_queue_size = 1 +maintain_initial_size = false +{% endif %} +ebs_settings = ebs + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +use_public_ips = false + +[ebs ebs] +ebs_volume_id = {{ volume_id }} +shared_dir = {{ existing_mount_dir }} +volume_type = gp2 From 0c33cb890e2bd72fdbcb2b5b56f723bdb1a5e82f Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 26 Nov 2020 17:45:16 +0100 Subject: [PATCH 18/66] Rename internal variables and functions from master_* to head_node_* Signed-off-by: Enrico Usai --- cli/src/awsbatch/awsbsub.py | 2 +- cli/src/awsbatch/common.py | 6 +- cli/src/pcluster/cluster_model.py | 6 +- cli/src/pcluster/commands.py | 30 +++---- cli/src/pcluster/config/cfn_param_types.py | 46 +++++------ cli/src/pcluster/config/mappings.py | 4 +- cli/src/pcluster/config/pcluster_config.py | 4 +- cli/src/pcluster/config/update_policy.py | 6 +- cli/src/pcluster/config/validators.py | 24 +++--- cli/src/pcluster/configure/easyconfig.py | 36 ++++---- cli/src/pcluster/configure/networking.py | 6 +- cli/src/pcluster/dcv/connect.py | 14 ++-- .../pcluster/models/hit/hit_cluster_model.py | 33 ++++---- .../pcluster/models/sit/sit_cluster_model.py | 35 ++++---- .../batch/docker/scripts/mount_nfs.sh | 24 +++--- cli/src/pcluster/utils.py | 20 ++--- .../pcluster/config/test_json_param_types.py | 12 +-- cli/tests/pcluster/config/test_section_efs.py | 3 +- cli/tests/pcluster/config/test_validators.py | 6 +- cli/tests/pcluster/config/utils.py | 6 +- .../configure/test_pcluster_configure.py | 80 +++++++++--------- cli/tests/pcluster/test_utils.py | 18 ++-- cloudformation/aws-parallelcluster.cfn.json | 2 +- tests/integration-tests/clusters_factory.py | 2 +- .../remote_command_executor.py | 4 +- 
.../test_cloudwatch_logging.py | 82 +++++++++---------- .../tests/create/test_create.py | 2 +- tests/integration-tests/tests/dcv/test_dcv.py | 4 +- .../tests/multiple_nics/test_multiple_nics.py | 12 +-- .../tests/storage/snapshots_factory.py | 4 +- .../tests/storage/storage_common.py | 8 +- .../tests/storage/test_efs.py | 8 +- .../generate-efs-substack.py | 16 ++-- 33 files changed, 284 insertions(+), 281 deletions(-) diff --git a/cli/src/awsbatch/awsbsub.py b/cli/src/awsbatch/awsbsub.py index ea67dfdafa..5c2f9f7620 100644 --- a/cli/src/awsbatch/awsbsub.py +++ b/cli/src/awsbatch/awsbsub.py @@ -566,7 +566,7 @@ def main(): retry_attempts=args.retry_attempts, timeout=args.timeout, env=[ - ("MASTER_IP", config.master_ip), # TODO remove + ("MASTER_IP", config.head_node_ip), # TODO remove ("PCLUSTER_JOB_S3_URL", "s3://{0}/{1}".format(config.s3_bucket, job_s3_folder)), ], ) diff --git a/cli/src/awsbatch/common.py b/cli/src/awsbatch/common.py index f9bcb41a58..022ffdb3a8 100644 --- a/cli/src/awsbatch/common.py +++ b/cli/src/awsbatch/common.py @@ -178,7 +178,7 @@ def __verify_initialization(self, log): log.debug("compute_environment = %s", self.compute_environment) log.debug("job_queue = %s", self.job_queue) log.debug("job_definition = %s", self.job_definition) - log.debug("master_ip = %s", self.master_ip) + log.debug("master_ip = %s", self.head_node_ip) log.info(self) except AttributeError as e: fail( @@ -261,7 +261,7 @@ def __init_from_config(self, cli_config_file, cluster, log): # noqa: C901 FIXME self.job_definition_mnp = config.get(cluster_section, "job_definition_mnp") except NoOptionError: pass - self.master_ip = config.get(cluster_section, "master_ip") + self.head_node_ip = config.get(cluster_section, "master_ip") # get proxy self.proxy = config.get(cluster_section, "proxy") @@ -316,7 +316,7 @@ def __init_from_stack(self, cluster, log): # noqa: C901 FIXME elif output_key == "BatchJobDefinitionArn": self.job_definition = output_value elif output_key == "MasterPrivateIP": - self.master_ip = output_value + self.head_node_ip = output_value elif output_key == "BatchJobDefinitionMnpArn": self.job_definition_mnp = output_value diff --git a/cli/src/pcluster/cluster_model.py b/cli/src/pcluster/cluster_model.py index e0e601cff4..e35bfbb735 100644 --- a/cli/src/pcluster/cluster_model.py +++ b/cli/src/pcluster/cluster_model.py @@ -133,11 +133,11 @@ def _get_latest_alinux_ami_id(self): def public_ips_in_compute_subnet(self, pcluster_config, network_interfaces_count): """Tell if public IPs will be used in compute subnet.""" vpc_section = pcluster_config.get_section("vpc") - master_subnet_id = vpc_section.get_param_value("master_subnet_id") + head_node_subnet_id = vpc_section.get_param_value("master_subnet_id") compute_subnet_id = vpc_section.get_param_value("compute_subnet_id") use_public_ips = vpc_section.get_param_value("use_public_ips") and ( - # For single NIC instances we check only if subnet is the same of master node - (not compute_subnet_id or compute_subnet_id == master_subnet_id) + # For single NIC instances we check only if subnet is the same of head node + (not compute_subnet_id or compute_subnet_id == head_node_subnet_id) # For multiple NICs instances we check also if subnet is different # to warn users about the current lack of support for public IPs or (network_interfaces_count > 1) diff --git a/cli/src/pcluster/commands.py b/cli/src/pcluster/commands.py index 3a2dc105da..5e5fe1fd68 100644 --- a/cli/src/pcluster/commands.py +++ b/cli/src/pcluster/commands.py @@ -424,33 +424,33 @@ def 
list_stacks(args): sys.exit(0) -def _poll_master_server_state(stack_name): +def _poll_head_node_state(stack_name): ec2 = boto3.client("ec2") try: - instances = utils.describe_cluster_instances(stack_name, node_type=utils.NodeType.master) + instances = utils.describe_cluster_instances(stack_name, node_type=utils.NodeType.head_node) if not instances: LOGGER.error("Cannot retrieve master node status. Exiting...") sys.exit(1) - master_id = instances[0].get("InstanceId") + head_node_id = instances[0].get("InstanceId") state = instances[0].get("State").get("Name") sys.stdout.write("\rMasterServer: %s" % state.upper()) sys.stdout.flush() while state not in ["running", "stopped", "terminated", "shutting-down"]: time.sleep(5) state = ( - ec2.describe_instance_status(InstanceIds=[master_id]) + ec2.describe_instance_status(InstanceIds=[head_node_id]) .get("InstanceStatuses")[0] .get("InstanceState") .get("Name") ) - master_status = "\r\033[KMasterServer: %s" % state.upper() - sys.stdout.write(master_status) + head_node_status = "\r\033[KMasterServer: %s" % state.upper() + sys.stdout.write(head_node_status) sys.stdout.flush() if state in ["terminated", "shutting-down"]: LOGGER.info("State: %s is irrecoverable. Cluster needs to be re-created.", state) sys.exit(1) - master_status = "\rMasterServer: %s\n" % state.upper() - sys.stdout.write(master_status) + head_node_status = "\rMasterServer: %s\n" % state.upper() + sys.stdout.write(head_node_status) sys.stdout.flush() except ClientError as e: LOGGER.critical(e.response.get("Error").get("Message")) @@ -475,9 +475,9 @@ def instances(args): scheduler = utils.get_cfn_param(cfn_stack.get("Parameters"), "Scheduler") instances = [] - master_server = utils.describe_cluster_instances(stack_name, node_type=utils.NodeType.master) - if master_server: - instances.append(("MasterServer", master_server[0].get("InstanceId"))) + head_node_server = utils.describe_cluster_instances(stack_name, node_type=utils.NodeType.head_node) + if head_node_server: + instances.append(("MasterServer", head_node_server[0].get("InstanceId"))) if scheduler != "awsbatch": instances.extend(_get_compute_instances(stack_name)) @@ -491,7 +491,7 @@ def instances(args): def ssh(args, extra_args): # noqa: C901 FIXME!!! """ - Execute an SSH command to the master instance, according to the [aliases] section if there. + Execute an SSH command to the head node instance, according to the [aliases] section if there. :param args: pcluster CLI args :param extra_args: pcluster CLI extra_args @@ -504,7 +504,7 @@ def ssh(args, extra_args): # noqa: C901 FIXME!!! ssh_command = "ssh {CFN_USER}@{MASTER_IP} {ARGS}" try: - master_ip, username = utils.get_master_ip_and_username(args.cluster_name) + head_node_ip, username = utils.get_head_node_ip_and_username(args.cluster_name) try: from shlex import quote as cmd_quote except ImportError: @@ -512,7 +512,7 @@ def ssh(args, extra_args): # noqa: C901 FIXME!!! # build command cmd = ssh_command.format( - CFN_USER=username, MASTER_IP=master_ip, ARGS=" ".join(cmd_quote(str(arg)) for arg in extra_args) + CFN_USER=username, MASTER_IP=head_node_ip, ARGS=" ".join(cmd_quote(str(arg)) for arg in extra_args) ) # run command @@ -558,7 +558,7 @@ def status(args): # noqa: C901 FIXME!!! 
sys.stdout.write("\rStatus: %s\n" % stack.get("StackStatus")) sys.stdout.flush() if stack.get("StackStatus") in ["CREATE_COMPLETE", "UPDATE_COMPLETE", "UPDATE_ROLLBACK_COMPLETE"]: - state = _poll_master_server_state(stack_name) + state = _poll_head_node_state(stack_name) if state == "running": _print_stack_outputs(stack) _print_compute_fleet_status(args.cluster_name, stack) diff --git a/cli/src/pcluster/config/cfn_param_types.py b/cli/src/pcluster/config/cfn_param_types.py index 7127323012..4d84b61a69 100644 --- a/cli/src/pcluster/config/cfn_param_types.py +++ b/cli/src/pcluster/config/cfn_param_types.py @@ -581,7 +581,7 @@ def to_file(self, config_parser, write_defaults=False): pass -class MasterAvailabilityZoneCfnParam(AvailabilityZoneCfnParam): +class HeadNodeAvailabilityZoneCfnParam(AvailabilityZoneCfnParam): """ Class to manage master_availability_zone internal attribute. @@ -596,9 +596,9 @@ def from_file(self, config_parser): return self def from_cfn_params(self, cfn_params): - """Initialize the Availability zone by checking the Compute Subnet from cfn.""" - master_subnet_id = get_cfn_param(cfn_params, "MasterSubnetId") - self.value = get_availability_zone_of_subnet(master_subnet_id) + """Initialize the Availability zone by checking the head node subnet from cfn.""" + head_node_subnet_id = get_cfn_param(cfn_params, "MasterSubnetId") + self.value = get_availability_zone_of_subnet(head_node_subnet_id) return self @@ -672,9 +672,9 @@ def to_cfn(self): cfn_params = {self.definition.get("cfn_param_mapping"): "NONE,NONE,NONE,NONE"} cluster_config = self.pcluster_config.get_section(self.section_key) if self.value: - master_instance_type = cluster_config.get_param_value("master_instance_type") - master_cores, disable_master_ht_via_cpu_options = self._get_cfn_params_for_instance_type( - master_instance_type + head_node_instance_type = cluster_config.get_param_value("master_instance_type") + head_node_cores, disable_head_node_ht_via_cpu_options = self._get_cfn_params_for_instance_type( + head_node_instance_type ) if ( @@ -692,7 +692,7 @@ def to_cfn(self): disable_compute_ht_via_cpu_options = False for node_label, cores, instance_type in [ - ("master", master_cores, master_instance_type), + ("master", head_node_cores, head_node_instance_type), ("compute", compute_cores, compute_instance_type), ]: if isinstance(cores, int) and cores < 0: @@ -703,9 +703,9 @@ def to_cfn(self): cfn_params.update( { self.definition.get("cfn_param_mapping"): "{0},{1},{2},{3}".format( - master_cores, + head_node_cores, compute_cores, - str(disable_master_ht_via_cpu_options).lower(), + str(disable_head_node_ht_via_cpu_options).lower(), str(disable_compute_ht_via_cpu_options).lower(), ) } @@ -801,22 +801,22 @@ def get_instance_type_architecture(instance_type): """Compute cluster's 'Architecture' CFN parameter based on its master server instance type.""" if not instance_type: error("Cannot infer architecture without master instance type") - master_inst_supported_architectures = get_supported_architectures_for_instance_type(instance_type) + head_node_supported_architectures = get_supported_architectures_for_instance_type(instance_type) - if not master_inst_supported_architectures: + if not head_node_supported_architectures: error("Unable to get architectures supported by instance type {0}.".format(instance_type)) # If the instance type supports multiple architectures, choose the first one. # TODO: this is currently not an issue because none of the instance types we support more than one of the # architectures we support. 
If this were ever to change (e.g., we start supporting i386) then we would # probably need to choose based on the subset of the architecutres supported by both the master and # compute instance types. - return master_inst_supported_architectures[0] + return head_node_supported_architectures[0] def refresh(self): """Initialize the private architecture param.""" if self.value: - master_inst_type = self.owner_section.get_param_value("master_instance_type") - architecture = self.get_instance_type_architecture(master_inst_type) + head_node_instance_type = self.owner_section.get_param_value("master_instance_type") + architecture = self.get_instance_type_architecture(head_node_instance_type) self.owner_section.get_param("architecture").value = architecture @@ -1215,28 +1215,28 @@ def to_storage(self, storage_params=None): cfn_items.append(param.get_cfn_value()) if cfn_items[0] == "NONE": - master_mt_valid = False + head_node_mt_valid = False compute_mt_valid = False - master_avail_zone = "fake_az1" + head_node_avail_zone = "fake_az1" compute_avail_zone = "fake_az2" # empty dict or first item is NONE --> set all values to NONE cfn_items = ["NONE"] * len(self.definition.get("params")) else: # add another CFN param that will identify if create or not a Mount Target for the given EFS FS Id - master_avail_zone = self.pcluster_config.get_master_availability_zone() - master_mount_target_id = get_efs_mount_target_id( - efs_fs_id=self.get_param_value("efs_fs_id"), avail_zone=master_avail_zone + head_node_avail_zone = self.pcluster_config.get_head_node_availability_zone() + head_node_mount_target_id = get_efs_mount_target_id( + efs_fs_id=self.get_param_value("efs_fs_id"), avail_zone=head_node_avail_zone ) compute_avail_zone = self.pcluster_config.get_compute_availability_zone() compute_mount_target_id = get_efs_mount_target_id( efs_fs_id=self.get_param_value("efs_fs_id"), avail_zone=compute_avail_zone ) - master_mt_valid = bool(master_mount_target_id) + head_node_mt_valid = bool(head_node_mount_target_id) compute_mt_valid = bool(compute_mount_target_id) - cfn_items.append("Valid" if master_mt_valid else "NONE") + cfn_items.append("Valid" if head_node_mt_valid else "NONE") # Do not create additional compute mount target if compute and master subnet in the same AZ - cfn_items.append("Valid" if compute_mt_valid or (master_avail_zone == compute_avail_zone) else "NONE") + cfn_items.append("Valid" if compute_mt_valid or (head_node_avail_zone == compute_avail_zone) else "NONE") cfn_params[cfn_converter] = ",".join(cfn_items) return storage_params diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py index 480ed8ee88..38e9b64699 100644 --- a/cli/src/pcluster/config/mappings.py +++ b/cli/src/pcluster/config/mappings.py @@ -26,9 +26,9 @@ EFSCfnSection, ExtraJsonCfnParam, FloatCfnParam, + HeadNodeAvailabilityZoneCfnParam, IntCfnParam, MaintainInitialSizeCfnParam, - MasterAvailabilityZoneCfnParam, NetworkInterfacesCountCfnParam, QueueSizeCfnParam, SettingsCfnParam, @@ -288,7 +288,7 @@ }, "master_availability_zone": { # NOTE: this is not exposed as a configuration parameter - "type": MasterAvailabilityZoneCfnParam, + "type": HeadNodeAvailabilityZoneCfnParam, "cfn_param_mapping": "AvailabilityZone", "update_policy": UpdatePolicy.IGNORED, "visibility": Visibility.PRIVATE diff --git a/cli/src/pcluster/config/pcluster_config.py b/cli/src/pcluster/config/pcluster_config.py index c006f6482a..bc122dbf0a 100644 --- a/cli/src/pcluster/config/pcluster_config.py +++ 
b/cli/src/pcluster/config/pcluster_config.py @@ -494,8 +494,8 @@ def validate(self): # test provided configuration self.__test_configuration() - def get_master_availability_zone(self): - """Get the Availability zone of the Master Subnet.""" + def get_head_node_availability_zone(self): + """Get the Availability zone of the Head Node Subnet.""" return self.get_section("vpc").get_param_value("master_availability_zone") def get_compute_availability_zone(self): diff --git a/cli/src/pcluster/config/update_policy.py b/cli/src/pcluster/config/update_policy.py index 379a4928f0..0d80773f96 100644 --- a/cli/src/pcluster/config/update_policy.py +++ b/cli/src/pcluster/config/update_policy.py @@ -210,12 +210,12 @@ def _check_generated_bucket(change, patch): condition_checker=lambda change, patch: not utils.cluster_has_running_capacity(patch.stack_name), ) -# Update supported only with master node down -UpdatePolicy.MASTER_STOP = UpdatePolicy( +# Update supported only with head node down +UpdatePolicy.HEAD_NODE_STOP = UpdatePolicy( level=20, fail_reason="To perform this update action, the master node must be in a stopped state", action_needed=UpdatePolicy.ACTIONS_NEEDED["pcluster_stop"], - condition_checker=lambda change, patch: utils.get_master_server_state(patch.stack_name) == "stopped", + condition_checker=lambda change, patch: utils.get_head_node_state(patch.stack_name) == "stopped", ) # Expected Behavior: diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index acf90106c1..684cd9e79d 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -118,15 +118,15 @@ def efs_id_validator(param_key, param_value, pcluster_config): errors = [] warnings = [] try: - # Get master availability zone - master_avail_zone = pcluster_config.get_master_availability_zone() - mount_target_id = get_efs_mount_target_id(efs_fs_id=param_value, avail_zone=master_avail_zone) + # Get head node availability zone + head_node_avail_zone = pcluster_config.get_head_node_availability_zone() + head_node_target_id = get_efs_mount_target_id(efs_fs_id=param_value, avail_zone=head_node_avail_zone) # If there is an existing mt in the az, need to check the inbound and outbound rules of the security groups - if mount_target_id: + if head_node_target_id: # Get list of security group IDs of the mount target sg_ids = ( boto3.client("efs") - .describe_mount_target_security_groups(MountTargetId=mount_target_id) + .describe_mount_target_security_groups(MountTargetId=head_node_target_id) .get("SecurityGroups") ) if not _check_in_out_access(sg_ids, port=2049): @@ -134,7 +134,7 @@ def efs_id_validator(param_key, param_value, pcluster_config): "There is an existing Mount Target {0} in the Availability Zone {1} for EFS {2}, " "but it does not have a security group that allows inbound and outbound rules to support NFS. 
" "Please modify the Mount Target's security group, to allow traffic on port 2049.".format( - mount_target_id, master_avail_zone, param_value + head_node_target_id, head_node_avail_zone, param_value ) ) except ClientError as e: @@ -383,12 +383,12 @@ def dcv_enabled_validator(param_key, param_value, pcluster_config): "Please double check the 'base_os' configuration parameter".format(allowed_oses) ) - master_instance_type = cluster_section.get_param_value("master_instance_type") - if re.search(r"(micro)|(nano)", master_instance_type): + head_node_instance_type = cluster_section.get_param_value("master_instance_type") + if re.search(r"(micro)|(nano)", head_node_instance_type): warnings.append( "The packages required for desktop virtualization in the selected instance type '{0}' " "may cause instability of the master instance. If you want to use NICE DCV it is recommended " - "to use an instance type with at least 1.7 GB of memory.".format(master_instance_type) + "to use an instance type with at least 1.7 GB of memory.".format(head_node_instance_type) ) if pcluster_config.get_section("dcv").get_param_value("access_from") == CIDR_ALL_IPS: @@ -956,7 +956,7 @@ def instances_architecture_compatibility_validator(param_key, param_value, pclus errors = [] warnings = [] - master_architecture = pcluster_config.get_section("cluster").get_param_value("architecture") + head_node_architecture = pcluster_config.get_section("cluster").get_param_value("architecture") # When awsbatch is used as the scheduler, compute_instance_type can contain a CSV list. compute_instance_types = param_value.split(",") for compute_instance_type in compute_instance_types: @@ -970,11 +970,11 @@ def instances_architecture_compatibility_validator(param_key, param_value, pclus ) continue compute_architectures = get_supported_architectures_for_instance_type(compute_instance_type) - if master_architecture not in compute_architectures: + if head_node_architecture not in compute_architectures: errors.append( "The specified compute_instance_type ({0}) supports the architectures {1}, none of which are " "compatible with the architecture supported by the master_instance_type ({2}).".format( - compute_instance_type, compute_architectures, master_architecture + compute_instance_type, compute_architectures, head_node_architecture ) ) diff --git a/cli/src/pcluster/configure/easyconfig.py b/cli/src/pcluster/configure/easyconfig.py index 362d548036..f4b0741906 100644 --- a/cli/src/pcluster/configure/easyconfig.py +++ b/cli/src/pcluster/configure/easyconfig.py @@ -230,8 +230,8 @@ def _create_vpc_parameters(vpc_section, cluster_config): vpc_id = prompt_iterable("VPC ID", vpc_list, default_value=default_vpc) vpc_parameters["vpc_id"] = vpc_id subnet_list = vpc_and_subnets["vpc_subnets"][vpc_id] - qualified_master_subnets = _filter_subnets_offering_instance_type( - subnet_list, cluster_config.master_instance_type + qualified_head_node_subnets = _filter_subnets_offering_instance_type( + subnet_list, cluster_config.head_node_instance_type ) if cluster_config.scheduler != "awsbatch": qualified_compute_subnets = _filter_subnets_offering_instance_type( @@ -241,20 +241,20 @@ def _create_vpc_parameters(vpc_section, cluster_config): # Special case of awsbatch, where compute instance type is not specified qualified_compute_subnets = subnet_list if ( - not qualified_master_subnets + not qualified_head_node_subnets or not qualified_compute_subnets or (prompt("Automate Subnet creation? 
(y/n)", lambda x: x in ("y", "n"), default_value="y") == "y") ): # Start auto subnets creation in the absence of qualified subnets. # Otherwise, user selects between manual and automate subnets creation - if not qualified_master_subnets or not qualified_compute_subnets: + if not qualified_head_node_subnets or not qualified_compute_subnets: print("There are no qualified subnets. Starting automatic creation of subnets...") vpc_parameters.update( automate_subnet_creation(vpc_id, _choose_network_configuration(cluster_config), min_subnet_size) ) else: vpc_parameters.update( - _ask_for_subnets(subnet_list, vpc_section, qualified_master_subnets, qualified_compute_subnets) + _ask_for_subnets(subnet_list, vpc_section, qualified_head_node_subnets, qualified_compute_subnets) ) return vpc_parameters @@ -264,20 +264,20 @@ def _filter_subnets_offering_instance_type(subnet_list, instance_type): return [subnet_entry for subnet_entry in subnet_list if subnet_entry["availability_zone"] in qualified_azs] -def _ask_for_subnets(subnet_list, vpc_section, qualified_master_subnets, qualified_compute_subnets): - master_subnet_id = _prompt_for_subnet( - vpc_section.get_param_value("master_subnet_id"), subnet_list, qualified_master_subnets, "Master Subnet ID" +def _ask_for_subnets(subnet_list, vpc_section, qualified_head_node_subnets, qualified_compute_subnets): + head_node_subnet_id = _prompt_for_subnet( + vpc_section.get_param_value("master_subnet_id"), subnet_list, qualified_head_node_subnets, "Master Subnet ID" ) compute_subnet_id = _prompt_for_subnet( - vpc_section.get_param_value("compute_subnet_id") or master_subnet_id, + vpc_section.get_param_value("compute_subnet_id") or head_node_subnet_id, subnet_list, qualified_compute_subnets, "Compute Subnet ID", ) - vpc_parameters = {"master_subnet_id": master_subnet_id} + vpc_parameters = {"master_subnet_id": head_node_subnet_id} - if master_subnet_id != compute_subnet_id: + if head_node_subnet_id != compute_subnet_id: vpc_parameters["compute_subnet_id"] = compute_subnet_id return vpc_parameters @@ -286,9 +286,9 @@ def _ask_for_subnets(subnet_list, vpc_section, qualified_master_subnets, qualifi def _choose_network_configuration(cluster_config): if cluster_config.scheduler == "awsbatch": return PublicPrivateNetworkConfig() - azs_for_master_type = get_supported_az_for_one_instance_type(cluster_config.master_instance_type) + azs_for_head_node_type = get_supported_az_for_one_instance_type(cluster_config.head_node_instance_type) azs_for_compute_type = get_supported_az_for_one_instance_type(cluster_config.compute_instance_type) - common_availability_zones = set(azs_for_master_type) & set(azs_for_compute_type) + common_availability_zones = set(azs_for_head_node_type) & set(azs_for_compute_type) if not common_availability_zones: # Automate subnet creation only allows subnets to reside in a single az. @@ -297,7 +297,7 @@ def _choose_network_configuration(cluster_config): "Error: There is no single availability zone offering master and compute in current region.\n" "To create your cluster, make sure you have a subnet for master node in {0}" ", and a subnet for compute nodes in {1}. 
Then run pcluster configure again" - "and avoid using Automate VPC/Subnet creation.".format(azs_for_master_type, azs_for_compute_type) + "and avoid using Automate VPC/Subnet creation.".format(azs_for_head_node_type, azs_for_compute_type) ) print("Exiting...") sys.exit(1) @@ -366,8 +366,8 @@ def prompt_os(self): ) def prompt_instance_types(self): - """Ask for master_instance_type and compute_instance_type (if necessary).""" - self.master_instance_type = prompt( + """Ask for head_node_instance_type and compute_instance_type (if necessary).""" + self.head_node_instance_type = prompt( "Master instance type", lambda x: _is_instance_type_supported_for_head_node(x) and x in get_supported_instance_types(), default_value=self.cluster_section.get_param_value("master_instance_type"), @@ -399,7 +399,7 @@ def get_scheduler_parameters(self): """Return a dict containing the scheduler dependent parameters.""" scheduler_parameters = { "base_os": self.base_os, - "master_instance_type": self.master_instance_type, + "master_instance_type": self.head_node_instance_type, "compute_instance_type": self.compute_instance_type, self.max_size_name: self.max_cluster_size, self.min_size_name: self.min_cluster_size, @@ -424,4 +424,4 @@ def cache_qualified_az(self): Cache is done inside get get_supported_az_for_instance_types. """ if not self.is_aws_batch: - get_supported_az_for_multi_instance_types([self.master_instance_type, self.compute_instance_type]) + get_supported_az_for_multi_instance_types([self.head_node_instance_type, self.compute_instance_type]) diff --git a/cli/src/pcluster/configure/networking.py b/cli/src/pcluster/configure/networking.py index 18aa484327..dff6834677 100644 --- a/cli/src/pcluster/configure/networking.py +++ b/cli/src/pcluster/configure/networking.py @@ -33,7 +33,7 @@ DEFAULT_AWS_REGION_NAME = "us-east-1" LOGGER = logging.getLogger(__name__) TIMESTAMP = "-{:%Y%m%d%H%M%S}".format(datetime.datetime.utcnow()) -MASTER_SUBNET_IPS = 250 +HEAD_NODE_SUBNET_IPS = 250 if sys.version_info >= (3, 4): ABC = abc.ABC @@ -106,7 +106,7 @@ def get_cfn_parameters(self, vpc_id, internet_gateway_id, public_cidr): def _create(self, vpc_id, vpc_cidr, subnet_cidrs, internet_gateway_id, compute_subnet_size): public_cidr = get_subnet_cidr( - vpc_cidr=vpc_cidr, occupied_cidr=subnet_cidrs, min_subnet_size=compute_subnet_size + MASTER_SUBNET_IPS + vpc_cidr=vpc_cidr, occupied_cidr=subnet_cidrs, min_subnet_size=compute_subnet_size + HEAD_NODE_SUBNET_IPS ) _validate_cidr(public_cidr) parameters = self.get_cfn_parameters(vpc_id, internet_gateway_id, public_cidr) @@ -133,7 +133,7 @@ def get_cfn_parameters(self, vpc_id, internet_gateway_id, public_cidr, private_c return parameters def _create(self, vpc_id, vpc_cidr, subnet_cidrs, internet_gateway_id, compute_subnet_size): # noqa D102 - public_cidr = evaluate_cidr(vpc_cidr=vpc_cidr, occupied_cidrs=subnet_cidrs, target_size=MASTER_SUBNET_IPS) + public_cidr = evaluate_cidr(vpc_cidr=vpc_cidr, occupied_cidrs=subnet_cidrs, target_size=HEAD_NODE_SUBNET_IPS) _validate_cidr(public_cidr) subnet_cidrs.append(public_cidr) private_cidr = get_subnet_cidr( diff --git a/cli/src/pcluster/dcv/connect.py b/cli/src/pcluster/dcv/connect.py index ec999b8509..caa6f99f3d 100644 --- a/cli/src/pcluster/dcv/connect.py +++ b/cli/src/pcluster/dcv/connect.py @@ -16,7 +16,7 @@ from pcluster.config.pcluster_config import PclusterConfig from pcluster.constants import PCLUSTER_ISSUES_LINK from pcluster.dcv.utils import DCV_CONNECT_SCRIPT -from pcluster.utils import error, get_cfn_param, 
get_master_ip_and_username, get_stack, get_stack_name, retry +from pcluster.utils import error, get_cfn_param, get_head_node_ip_and_username, get_stack, get_stack_name, retry LOGGER = logging.getLogger(__name__) @@ -43,17 +43,17 @@ def dcv_connect(args): # Prepare ssh command to execute in the master instance stack = get_stack(get_stack_name(args.cluster_name)) shared_dir = get_cfn_param(stack.get("Parameters"), "SharedDir") - master_ip, username = get_master_ip_and_username(args.cluster_name) - cmd = 'ssh {CFN_USER}@{MASTER_IP} {KEY} "{REMOTE_COMMAND} {DCV_SHARED_DIR}"'.format( + head_node_ip, username = get_head_node_ip_and_username(args.cluster_name) + cmd = 'ssh {CFN_USER}@{HEAD_NODE_IP} {KEY} "{REMOTE_COMMAND} {DCV_SHARED_DIR}"'.format( CFN_USER=username, - MASTER_IP=master_ip, + HEAD_NODE_IP=head_node_ip, KEY="-i {0}".format(args.key_path) if args.key_path else "", REMOTE_COMMAND=DCV_CONNECT_SCRIPT, DCV_SHARED_DIR=shared_dir, ) try: - url = retry(_retrieve_dcv_session_url, func_args=[cmd, args.cluster_name, master_ip], attempts=4) + url = retry(_retrieve_dcv_session_url, func_args=[cmd, args.cluster_name, head_node_ip], attempts=4) url_message = "Please use the following one-time URL in your browser within 30 seconds:\n{0}".format(url) except DCVConnectionError as e: error( @@ -73,7 +73,7 @@ def dcv_connect(args): LOGGER.info("{0}\n{1}".format(e, url_message)) -def _retrieve_dcv_session_url(ssh_cmd, cluster_name, master_ip): +def _retrieve_dcv_session_url(ssh_cmd, cluster_name, head_node_ip): """Connect by ssh to the master instance, prepare DCV session and return the DCV session URL.""" try: LOGGER.debug("SSH command: {0}".format(ssh_cmd)) @@ -106,5 +106,5 @@ def _retrieve_dcv_session_url(ssh_cmd, cluster_name, master_ip): raise DCVConnectionError(e.output) return "https://{IP}:{PORT}?authToken={TOKEN}#{SESSION_ID}".format( - IP=master_ip, PORT=dcv_server_port, TOKEN=dcv_session_token, SESSION_ID=dcv_session_id + IP=head_node_ip, PORT=dcv_server_port, TOKEN=dcv_session_token, SESSION_ID=dcv_session_id ) diff --git a/cli/src/pcluster/models/hit/hit_cluster_model.py b/cli/src/pcluster/models/hit/hit_cluster_model.py index 5970d68346..5497dd123a 100644 --- a/cli/src/pcluster/models/hit/hit_cluster_model.py +++ b/cli/src/pcluster/models/hit/hit_cluster_model.py @@ -45,51 +45,52 @@ def test_configuration(self, pcluster_config): if not cluster_section or cluster_section.get_param_value("scheduler") == "awsbatch" or not vpc_section: return - master_instance_type = cluster_section.get_param_value("master_instance_type") + head_node_instance_type = cluster_section.get_param_value("master_instance_type") # Retrieve network parameters compute_subnet = vpc_section.get_param_value("compute_subnet_id") - master_subnet = vpc_section.get_param_value("master_subnet_id") + head_node_subnet = vpc_section.get_param_value("master_subnet_id") vpc_security_group = vpc_section.get_param_value("vpc_security_group_id") if not compute_subnet: - compute_subnet = master_subnet + compute_subnet = head_node_subnet security_groups_ids = [] if vpc_security_group: security_groups_ids.append(vpc_security_group) # Initialize CpuOptions disable_hyperthreading = cluster_section.get_param_value("disable_hyperthreading") - master_instance_type_info = InstanceTypeInfo.init_from_instance_type(master_instance_type) + head_node_instance_type_info = InstanceTypeInfo.init_from_instance_type(head_node_instance_type) # Set vcpus according to queue's disable_hyperthreading and instance features - master_vcpus = 
master_instance_type_info.vcpus_count() + head_node_vcpus = head_node_instance_type_info.vcpus_count() - master_threads_per_core = master_instance_type_info.default_threads_per_core() - master_cpu_options = ( - {"CoreCount": master_vcpus // master_threads_per_core, "ThreadsPerCore": 1} - if disable_hyperthreading and disable_ht_via_cpu_options(master_instance_type, master_threads_per_core) + head_node_threads_per_core = head_node_instance_type_info.default_threads_per_core() + head_node_cpu_options = ( + {"CoreCount": head_node_vcpus // head_node_threads_per_core, "ThreadsPerCore": 1} + if disable_hyperthreading + and disable_ht_via_cpu_options(head_node_instance_type, head_node_threads_per_core) else {} ) try: latest_alinux_ami_id = self._get_latest_alinux_ami_id() - master_network_interfaces = self.build_launch_network_interfaces( + head_node_network_interfaces = self.build_launch_network_interfaces( network_interfaces_count=int(cluster_section.get_param_value("network_interfaces_count")[0]), - use_efa=False, # EFA is not supported on master node + use_efa=False, # EFA is not supported on head node security_group_ids=security_groups_ids, - subnet=master_subnet, + subnet=head_node_subnet, use_public_ips=vpc_section.get_param_value("use_public_ips"), ) - # Test Master Instance Configuration + # Test Head Node Instance Configuration self._ec2_run_instance( pcluster_config, - InstanceType=master_instance_type, + InstanceType=head_node_instance_type, MinCount=1, MaxCount=1, ImageId=latest_alinux_ami_id, - CpuOptions=master_cpu_options, - NetworkInterfaces=master_network_interfaces, + CpuOptions=head_node_cpu_options, + NetworkInterfaces=head_node_network_interfaces, DryRun=True, ) diff --git a/cli/src/pcluster/models/sit/sit_cluster_model.py b/cli/src/pcluster/models/sit/sit_cluster_model.py index 5dac1d231f..b831dc9aed 100644 --- a/cli/src/pcluster/models/sit/sit_cluster_model.py +++ b/cli/src/pcluster/models/sit/sit_cluster_model.py @@ -67,30 +67,31 @@ def test_configuration(self, pcluster_config): ): return - master_instance_type = cluster_section.get_param_value("master_instance_type") + head_node_instance_type = cluster_section.get_param_value("master_instance_type") compute_instance_type = cluster_section.get_param_value("compute_instance_type") # Retrieve network parameters compute_subnet = vpc_section.get_param_value("compute_subnet_id") - master_subnet = vpc_section.get_param_value("master_subnet_id") + head_node_subnet = vpc_section.get_param_value("master_subnet_id") vpc_security_group = vpc_section.get_param_value("vpc_security_group_id") if not compute_subnet: - compute_subnet = master_subnet + compute_subnet = head_node_subnet security_groups_ids = [] if vpc_security_group: security_groups_ids.append(vpc_security_group) # Initialize CpuOptions disable_hyperthreading = cluster_section.get_param_value("disable_hyperthreading") - master_instance_type_info = InstanceTypeInfo.init_from_instance_type(master_instance_type) - master_vcpus = master_instance_type_info.vcpus_count() - master_threads_per_core = master_instance_type_info.default_threads_per_core() + head_node_instance_type_info = InstanceTypeInfo.init_from_instance_type(head_node_instance_type) + head_node_vcpus = head_node_instance_type_info.vcpus_count() + head_node_threads_per_core = head_node_instance_type_info.default_threads_per_core() compute_instance_type_info = InstanceTypeInfo.init_from_instance_type(compute_instance_type) compute_vcpus = compute_instance_type_info.vcpus_count() compute_threads_per_core = 
compute_instance_type_info.default_threads_per_core() - master_cpu_options = ( - {"CoreCount": master_vcpus // master_threads_per_core, "ThreadsPerCore": 1} - if disable_hyperthreading and disable_ht_via_cpu_options(master_instance_type, master_threads_per_core) + head_node_cpu_options = ( + {"CoreCount": head_node_vcpus // head_node_threads_per_core, "ThreadsPerCore": 1} + if disable_hyperthreading + and disable_ht_via_cpu_options(head_node_instance_type, head_node_threads_per_core) else {} ) compute_cpu_options = ( @@ -102,7 +103,7 @@ def test_configuration(self, pcluster_config): # Initialize Placement Group Logic placement_group = cluster_section.get_param_value("placement_group") placement = cluster_section.get_param_value("placement") - master_placement_group = ( + head_node_placement_group = ( {"GroupName": placement_group} if placement_group not in [None, "NONE", "DYNAMIC"] and placement == "cluster" else {} @@ -114,24 +115,24 @@ def test_configuration(self, pcluster_config): try: latest_alinux_ami_id = self._get_latest_alinux_ami_id() - master_network_interfaces = self.build_launch_network_interfaces( + head_node_network_interfaces = self.build_launch_network_interfaces( network_interfaces_count=int(cluster_section.get_param_value("network_interfaces_count")[0]), use_efa=False, # EFA is not supported on master node security_group_ids=security_groups_ids, - subnet=master_subnet, + subnet=head_node_subnet, use_public_ips=vpc_section.get_param_value("use_public_ips"), ) - # Test Master Instance Configuration + # Test head node configuration self._ec2_run_instance( pcluster_config, - InstanceType=master_instance_type, + InstanceType=head_node_instance_type, MinCount=1, MaxCount=1, ImageId=latest_alinux_ami_id, - CpuOptions=master_cpu_options, - NetworkInterfaces=master_network_interfaces, - Placement=master_placement_group, + CpuOptions=head_node_cpu_options, + NetworkInterfaces=head_node_network_interfaces, + Placement=head_node_placement_group, DryRun=True, ) diff --git a/cli/src/pcluster/resources/batch/docker/scripts/mount_nfs.sh b/cli/src/pcluster/resources/batch/docker/scripts/mount_nfs.sh index 2f6f30e12d..4a350500ac 100755 --- a/cli/src/pcluster/resources/batch/docker/scripts/mount_nfs.sh +++ b/cli/src/pcluster/resources/batch/docker/scripts/mount_nfs.sh @@ -13,7 +13,7 @@ # ANY KIND, express or implied. See the License for the specific # language governing permissions and limitations under the License. -# Usage: mount_filesystem.sh master_ip shared_dir +# Usage: mount_filesystem.sh head_node_ip shared_dir error_exit_usage() { echo "Error executing script: $1" @@ -28,20 +28,20 @@ error_exit() { usage() { cat < -master_ip: ip address of the main node -shared_dir: directory from master to be shared. If directory doesn't exist on compute, will be created +mount_nfs +head_node_ip: ip address of the main node +shared_dir: directory from head node to be shared. 
If directory doesn't exist on compute, will be created ENDUSAGE } # Check that the arguments are valid check_arguments_valid(){ - if [ -z "${master_ip}" ]; then - error_exit_usage "Master IP is a required argument" + if [ -z "${head_node_ip}" ]; then + error_exit_usage "Head Node IP is a required argument" fi if [ -z "${shared_dir}" ]; then @@ -58,22 +58,22 @@ mount_nfs() { fi mkdir -p ${shared_dir} - error_message=$(mount -t nfs -o hard,intr,noatime,_netdev "${master_ip}":"${shared_dir}" "${shared_dir}" 2>&1) + error_message=$(mount -t nfs -o hard,intr,noatime,_netdev "${head_node_ip}":"${shared_dir}" "${shared_dir}" 2>&1) if [[ $? -ne 0 ]]; then - error_exit "Failed to mount nfs volume from ${master_ip}:${shared_dir} with error_message: ${error_message}" + error_exit "Failed to mount nfs volume from ${head_node_ip}:${shared_dir} with error_message: ${error_message}" fi # Check that the filesystem is mounted as appropriate - mount_line=$(mount | grep "${master_ip}:${shared_dir}") + mount_line=$(mount | grep "${head_node_ip}:${shared_dir}") if [[ -z "${mount_line}" ]]; then - error_exit "mount succeeded but nfs volume from ${master_ip}:${shared_dir} was not mounted as expected" + error_exit "mount succeeded but nfs volume from ${head_node_ip}:${shared_dir} was not mounted as expected" fi } # main function main() { - master_ip=${1} + head_node_ip=${1} shared_dir=${2} if [[ "${shared_dir:0:1}" != '/' ]]; then shared_dir="/${shared_dir}" diff --git a/cli/src/pcluster/utils.py b/cli/src/pcluster/utils.py index c479ad500d..5f8704c219 100644 --- a/cli/src/pcluster/utils.py +++ b/cli/src/pcluster/utils.py @@ -840,7 +840,7 @@ def describe_cluster_instances(stack_name, node_type): return instances -def _get_master_server_ip(stack_name): +def _get_head_node_ip(stack_name): """ Get the IP Address of the MasterServer. @@ -851,17 +851,17 @@ def _get_master_server_ip(stack_name): instances = describe_cluster_instances(stack_name, node_type=NodeType.master) if not instances: error("MasterServer not running. 
Can't SSH") - master_instance = instances[0] - ip_address = master_instance.get("PublicIpAddress") + head_node = instances[0] + ip_address = head_node.get("PublicIpAddress") if ip_address is None: - ip_address = master_instance.get("PrivateIpAddress") - state = master_instance.get("State").get("Name") + ip_address = head_node.get("PrivateIpAddress") + state = head_node.get("State").get("Name") if state != "running" or ip_address is None: error("MasterServer: {0}\nCannot get ip address.".format(state.upper())) return ip_address -def get_master_ip_and_username(cluster_name): +def get_head_node_ip_and_username(cluster_name): cfn = boto3.client("cloudformation") try: stack_name = get_stack_name(cluster_name) @@ -872,13 +872,13 @@ def get_master_ip_and_username(cluster_name): if stack_status in ["DELETE_COMPLETE", "DELETE_IN_PROGRESS"]: error("Unable to retrieve master_ip and username for a stack in the status: {0}".format(stack_status)) else: - master_ip = _get_master_server_ip(stack_name) + head_node_ip = _get_head_node_ip(stack_name) template = cfn.get_template(StackName=stack_name) mappings = template.get("TemplateBody").get("Mappings").get("OSFeatures") base_os = get_cfn_param(stack_result.get("Parameters"), "BaseOS") username = mappings.get(base_os).get("User") - if not master_ip: + if not head_node_ip: error("Failed to get cluster {0} ip.".format(cluster_name)) if not username: error("Failed to get cluster {0} username.".format(cluster_name)) @@ -886,10 +886,10 @@ def get_master_ip_and_username(cluster_name): except ClientError as e: error(e.response.get("Error").get("Message")) - return master_ip, username + return head_node_ip, username -def get_master_server_state(stack_name): +def get_head_node_state(stack_name): """ Get the State of the MasterServer. 
diff --git a/cli/tests/pcluster/config/test_json_param_types.py b/cli/tests/pcluster/config/test_json_param_types.py index efa90eb1cd..3aa3057513 100644 --- a/cli/tests/pcluster/config/test_json_param_types.py +++ b/cli/tests/pcluster/config/test_json_param_types.py @@ -147,7 +147,7 @@ def test_config_to_json(capsys, boto3_stubber, test_datadir, pcluster_config_rea expected_json_params = _prepare_json_config(queues, test_datadir) # Mock expected boto3 calls - _mock_boto3(boto3_stubber, expected_json_params, master_instance_type="c4.xlarge") + _mock_boto3(boto3_stubber, expected_json_params, head_node_instance_type="c4.xlarge") # Load config from created config file dst_config_file = pcluster_config_reader(dst_config_file, queue_settings=queue_settings) @@ -179,7 +179,7 @@ def mock_get_avail_zone(subnet_id): expected_json_params = _prepare_json_config(queues, test_datadir) # Mock expected boto3 calls - _mock_boto3(boto3_stubber, expected_json_params, master_instance_type="t2.micro") + _mock_boto3(boto3_stubber, expected_json_params, head_node_instance_type="t2.micro") pcluster_config = get_mocked_pcluster_config(mocker, auto_refresh=False) cluster_section = CfnSection(CLUSTER_HIT, pcluster_config, section_label="default") @@ -244,14 +244,14 @@ def _prepare_json_config(queues, test_datadir): return expected_json_params -def _mock_boto3(boto3_stubber, expected_json_params, master_instance_type=None): +def _mock_boto3(boto3_stubber, expected_json_params, head_node_instance_type=None): """Mock the boto3 client based on the expected json configuration.""" expected_json_queue_settings = expected_json_params["cluster"].get("queue_settings", {}) mocked_requests = [] instance_types = [] - # One describe_instance_type for the Master node - if master_instance_type: - instance_types.append(master_instance_type) + # One describe_instance_type for the Head node + if head_node_instance_type: + instance_types.append(head_node_instance_type) # One describe_instance_type per compute resource for _, queue in expected_json_queue_settings.items(): diff --git a/cli/tests/pcluster/config/test_section_efs.py b/cli/tests/pcluster/config/test_section_efs.py index d90e71f42d..931e355ab8 100644 --- a/cli/tests/pcluster/config/test_section_efs.py +++ b/cli/tests/pcluster/config/test_section_efs.py @@ -164,7 +164,8 @@ def test_efs_param_from_file(mocker, param_key, param_value, expected_value, exp def test_efs_section_to_cfn(mocker, section_dict, expected_cfn_params): mocker.patch("pcluster.config.cfn_param_types.get_efs_mount_target_id", return_value="valid_mount_target_id") mocker.patch( - "pcluster.config.pcluster_config.PclusterConfig.get_master_availability_zone", return_value="mocked_avail_zone" + "pcluster.config.pcluster_config.PclusterConfig.get_head_node_availability_zone", + return_value="mocked_avail_zone", ) utils.assert_section_to_cfn(mocker, EFS, section_dict, expected_cfn_params) diff --git a/cli/tests/pcluster/config/test_validators.py b/cli/tests/pcluster/config/test_validators.py index 32b7dc81d9..b457bda99e 100644 --- a/cli/tests/pcluster/config/test_validators.py +++ b/cli/tests/pcluster/config/test_validators.py @@ -2211,7 +2211,7 @@ def test_disable_hyperthreading_architecture_validator(mocker, disable_hyperthre @pytest.mark.parametrize( - "master_architecture, compute_architecture, compute_instance_type, expected_message", + "head_node_architecture, compute_architecture, compute_instance_type, expected_message", [ # Single compute_instance_type ("x86_64", "x86_64", "c5.xlarge", []), @@ -2244,7 
+2244,7 @@ def test_disable_hyperthreading_architecture_validator(mocker, disable_hyperthre ], ) def test_instances_architecture_compatibility_validator( - mocker, caplog, master_architecture, compute_architecture, compute_instance_type, expected_message + mocker, caplog, head_node_architecture, compute_architecture, compute_instance_type, expected_message ): def internal_is_instance_type(itype): return "." in itype or itype == "optimal" @@ -2258,7 +2258,7 @@ def internal_is_instance_type(itype): logger_patch = mocker.patch.object(LOGFILE_LOGGER, "debug") run_architecture_validator_test( mocker, - {"cluster": {"architecture": master_architecture}}, + {"cluster": {"architecture": head_node_architecture}}, "cluster", "architecture", "compute_instance_type", diff --git a/cli/tests/pcluster/config/utils.py b/cli/tests/pcluster/config/utils.py index 9b3fbd9ee9..e1bdbacb77 100644 --- a/cli/tests/pcluster/config/utils.py +++ b/cli/tests/pcluster/config/utils.py @@ -103,10 +103,10 @@ def assert_param_from_file( def get_mock_pcluster_config_patches(scheduler, extra_patches=None): """Return mocks for a set of functions that should be mocked by default because they access the network.""" architectures = ["x86_64"] - master_instances = ["t2.micro", "t2.large", "c4.xlarge", "p4d.24xlarge"] - compute_instances = ["t2.micro", "t2.large", "t2", "optimal"] if scheduler == "awsbatch" else master_instances + head_node_instances = ["t2.micro", "t2.large", "c4.xlarge", "p4d.24xlarge"] + compute_instances = ["t2.micro", "t2.large", "t2", "optimal"] if scheduler == "awsbatch" else head_node_instances patches = { - "pcluster.config.validators.get_supported_instance_types": master_instances, + "pcluster.config.validators.get_supported_instance_types": head_node_instances, "pcluster.config.validators.get_supported_compute_instance_types": compute_instances, "pcluster.config.validators.get_supported_architectures_for_instance_type": architectures, "pcluster.config.cfn_param_types.get_availability_zone_of_subnet": "mocked_avail_zone", diff --git a/cli/tests/pcluster/configure/test_pcluster_configure.py b/cli/tests/pcluster/configure/test_pcluster_configure.py index 367f4fd2ce..5da9455901 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure.py +++ b/cli/tests/pcluster/configure/test_pcluster_configure.py @@ -345,15 +345,15 @@ def __init__(self, aws_region_name, key, scheduler): self.input_list = [] if aws_region_name is None else [aws_region_name] self.input_list.extend([key, scheduler]) - def add_first_flow(self, op_sys, min_size, max_size, master_instance, compute_instance): + def add_first_flow(self, op_sys, min_size, max_size, head_node_instance, compute_instance): if self.is_not_aws_batch: self.input_list.append(op_sys) - self.input_list.extend([min_size, max_size, master_instance]) + self.input_list.extend([min_size, max_size, head_node_instance]) if self.is_not_aws_batch: self.input_list.append(compute_instance) - def add_no_automation_no_empty_vpc(self, vpc_id, master_id, compute_id): - self.input_list.extend(["n", vpc_id, "n", master_id, compute_id]) + def add_no_automation_no_empty_vpc(self, vpc_id, head_node_id, compute_id): + self.input_list.extend(["n", vpc_id, "n", head_node_id, compute_id]) def add_sub_automation(self, vpc_id, network_configuration, vpc_has_subnets=True): self.input_list.extend(["n", vpc_id]) @@ -416,7 +416,7 @@ def _run_input_test_with_config( output, capsys, with_input=False, - master_instance="c5.xlarge", + head_node_instance="c5.xlarge", compute_instance="g3.8xlarge", 
): if with_input: @@ -425,16 +425,16 @@ def _run_input_test_with_config( op_sys="ubuntu1604", min_size="7", max_size="18", - master_instance=master_instance, + head_node_instance=head_node_instance, compute_instance=compute_instance, ) input_composer.add_no_automation_no_empty_vpc( - vpc_id="vpc-34567891", master_id="subnet-34567891", compute_id="subnet-45678912" + vpc_id="vpc-34567891", head_node_id="subnet-34567891", compute_id="subnet-45678912" ) else: input_composer = ComposeInput(aws_region_name="", key="", scheduler="") - input_composer.add_first_flow(op_sys="", min_size="", max_size="", master_instance="", compute_instance="") - input_composer.add_no_automation_no_empty_vpc(vpc_id="", master_id="", compute_id="") + input_composer.add_first_flow(op_sys="", min_size="", max_size="", head_node_instance="", compute_instance="") + input_composer.add_no_automation_no_empty_vpc(vpc_id="", head_node_id="", compute_id="") input_composer.mock_input(mocker) @@ -447,10 +447,10 @@ def test_no_automation_no_awsbatch_no_errors(mocker, capsys, test_datadir): MockHandler(mocker) input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="torque") input_composer.add_first_flow( - op_sys="alinux", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="alinux", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_no_automation_no_empty_vpc( - vpc_id="vpc-12345678", master_id="subnet-12345678", compute_id="subnet-23456789" + vpc_id="vpc-12345678", head_node_id="subnet-12345678", compute_id="subnet-23456789" ) input_composer.mock_input(mocker) @@ -483,10 +483,10 @@ def test_with_region_arg_with_config_file(mocker, capsys, test_datadir): input_composer = ComposeInput(aws_region_name=None, key="key1", scheduler="torque") input_composer.add_first_flow( - op_sys="alinux", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="alinux", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_no_automation_no_empty_vpc( - vpc_id="vpc-12345678", master_id="subnet-12345678", compute_id="subnet-23456789" + vpc_id="vpc-12345678", head_node_id="subnet-12345678", compute_id="subnet-23456789" ) input_composer.mock_input(mocker) os.environ["AWS_DEFAULT_REGION"] = "env_region_name_to_be_overwritten" @@ -524,7 +524,7 @@ def test_unexisting_instance_type(mocker, capsys, test_datadir): output, capsys, with_input=True, - master_instance="m6g.xlarge", + head_node_instance="m6g.xlarge", compute_instance="m6g.xlarge", ) @@ -533,7 +533,7 @@ def test_no_available_no_input_no_automation_no_errors_with_config_file(mocker, """ Testing easy config with user hitting return on all prompts. - Mocking the case where parameters: aws_region_name, key_name, vpc_id, compute_subnet_id, master_subnet_id. + Mocking the case where parameters: aws_region_name, key_name, vpc_id, compute_subnet_id, head_node_subnet_id. Are not found in available list under new partition/region/vpc configuration. 
After running easy config, the old original_config_file should be the same as pcluster.config.ini """ @@ -565,7 +565,7 @@ def test_with_input_no_automation_no_errors_with_config_file(mocker, capsys, tes output, capsys, with_input=True, - master_instance="m6g.xlarge", + head_node_instance="m6g.xlarge", compute_instance="m6g.xlarge", ) @@ -577,10 +577,10 @@ def test_no_automation_yes_awsbatch_no_errors(mocker, capsys, test_datadir): input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="awsbatch") input_composer.add_first_flow( - op_sys=None, min_size="13", max_size="14", master_instance="t2.nano", compute_instance=None + op_sys=None, min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance=None ) input_composer.add_no_automation_no_empty_vpc( - vpc_id="vpc-12345678", master_id="subnet-12345678", compute_id="subnet-23456789" + vpc_id="vpc-12345678", head_node_id="subnet-12345678", compute_id="subnet-23456789" ) input_composer.mock_input(mocker) @@ -595,7 +595,7 @@ def test_subnet_automation_no_awsbatch_no_errors_empty_vpc(mocker, capsys, test_ input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="sge") input_composer.add_first_flow( - op_sys="centos7", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="centos7", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_sub_automation( vpc_id="vpc-23456789", network_configuration=PUBLIC_PRIVATE_CONFIGURATION, vpc_has_subnets=False @@ -613,7 +613,7 @@ def test_subnet_automation_no_awsbatch_no_errors(mocker, capsys, test_datadir): input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="sge") input_composer.add_first_flow( - op_sys="centos7", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="centos7", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_sub_automation( vpc_id="vpc-12345678", network_configuration=PUBLIC_PRIVATE_CONFIGURATION, vpc_has_subnets=True @@ -632,7 +632,7 @@ def test_subnet_automation_no_awsbatch_no_errors_with_config_file(mocker, capsys input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="sge") input_composer.add_first_flow( - op_sys="centos7", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="centos7", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_sub_automation( vpc_id="vpc-12345678", network_configuration=PUBLIC_PRIVATE_CONFIGURATION, vpc_has_subnets=True @@ -650,7 +650,7 @@ def test_vpc_automation_no_awsbatch_no_errors(mocker, capsys, test_datadir): input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="sge") input_composer.add_first_flow( - op_sys="centos7", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="centos7", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_vpc_sub_automation(network_configuration=PUBLIC_PRIVATE_CONFIGURATION) input_composer.mock_input(mocker) @@ -666,7 +666,7 @@ def test_vpc_automation_yes_awsbatch_no_errors(mocker, capsys, test_datadir): input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="awsbatch") input_composer.add_first_flow( - op_sys=None, min_size="13", max_size="14", master_instance="t2.nano", 
compute_instance=None + op_sys=None, min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance=None ) input_composer.add_vpc_sub_automation(network_configuration=PUBLIC_PRIVATE_CONFIGURATION) input_composer.mock_input(mocker) @@ -685,7 +685,7 @@ def test_vpc_automation_invalid_vpc_block(mocker, capsys, test_datadir): input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="awsbatch") input_composer.add_first_flow( - op_sys=None, min_size="13", max_size="14", master_instance="t2.nano", compute_instance=None + op_sys=None, min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance=None ) input_composer.add_vpc_sub_automation(network_configuration=PUBLIC_PRIVATE_CONFIGURATION) input_composer.mock_input(mocker) @@ -702,7 +702,7 @@ def test_subnet_automation_yes_awsbatch_invalid_vpc(mocker, capsys, test_datadir input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="awsbatch") input_composer.add_first_flow( - op_sys=None, min_size="13", max_size="14", master_instance="t2.nano", compute_instance=None + op_sys=None, min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance=None ) input_composer.add_sub_automation(vpc_id="vpc-12345678", network_configuration=PUBLIC_PRIVATE_CONFIGURATION) input_composer.mock_input(mocker) @@ -718,7 +718,7 @@ def test_vpc_automation_no_vpc_in_region(mocker, capsys, test_datadir): input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="slurm") input_composer.add_first_flow( - op_sys="centos7", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="centos7", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_vpc_sub_automation_empty_region(network_configuration=PUBLIC_PRIVATE_CONFIGURATION) input_composer.mock_input(mocker) @@ -734,7 +734,7 @@ def test_vpc_automation_no_vpc_in_region_public(mocker, capsys, test_datadir): input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="slurm") input_composer.add_first_flow( - op_sys="centos7", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="centos7", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_vpc_sub_automation_empty_region(network_configuration="2") input_composer.mock_input(mocker) @@ -761,7 +761,7 @@ def test_bad_config_file(mocker, capsys, test_datadir): input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="sge") input_composer.add_first_flow( - op_sys="centos7", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="centos7", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_sub_automation( vpc_id="vpc-12345678", network_configuration=PUBLIC_PRIVATE_CONFIGURATION, vpc_has_subnets=True @@ -778,18 +778,18 @@ def general_wrapper_for_prompt_testing( op_sys="centos7", min_size="0", max_size="10", - master_instance="t2.nano", + head_node_instance="t2.nano", compute_instance="t2.micro", key="key1", vpc_id="vpc-12345678", - master_id="subnet-12345678", + head_node_id="subnet-12345678", compute_id="subnet-23456789", ): path = os.path.join(tempfile.gettempdir(), "test_pcluster_configure") MockHandler(mocker) input_composer = ComposeInput(aws_region_name=region, key=key, scheduler=scheduler) - input_composer.add_first_flow(op_sys, min_size, 
max_size, master_instance, compute_instance) - input_composer.add_no_automation_no_empty_vpc(vpc_id, master_id, compute_id) + input_composer.add_first_flow(op_sys, min_size, max_size, head_node_instance, compute_instance) + input_composer.add_no_automation_no_empty_vpc(vpc_id, head_node_id, compute_id) input_composer.mock_input(mocker) _run_configuration(mocker, path) @@ -808,7 +808,7 @@ def test_vpc_automation_with_no_single_qualified_az(mocker, capsys, test_datadir input_composer = ComposeInput(aws_region_name="eu-west-1", key="key1", scheduler="sge") input_composer.add_first_flow( - op_sys="centos7", min_size="13", max_size="14", master_instance="t2.nano", compute_instance="t2.micro" + op_sys="centos7", min_size="13", max_size="14", head_node_instance="t2.nano", compute_instance="t2.micro" ) input_composer.add_vpc_sub_automation(network_configuration=PUBLIC_PRIVATE_CONFIGURATION) input_composer.mock_input(mocker) @@ -878,30 +878,30 @@ def test_invalid_vpc(mocker, vpc_id): @pytest.mark.parametrize( - "vpc_id, master_id, compute_id", + "vpc_id, head_node_id, compute_id", [ ("vpc-12345678", "subnet-34567891", "subnet-45678912"), ("vpc-23456789", "subnet-34567891", "subnet-45678912"), ("vpc-34567891", "subnet-12345678", "subnet-23456789"), ], ) -def test_invalid_subnet(mocker, vpc_id, master_id, compute_id): +def test_invalid_subnet(mocker, vpc_id, head_node_id, compute_id): with pytest.raises(StopIteration): assert_that( - general_wrapper_for_prompt_testing(mocker, vpc_id=vpc_id, master_id=master_id, compute_id=compute_id) + general_wrapper_for_prompt_testing(mocker, vpc_id=vpc_id, head_node_id=head_node_id, compute_id=compute_id) ).is_true() @pytest.mark.parametrize( - "vpc_id, master_id, compute_id", + "vpc_id, head_node_id, compute_id", [("vpc-12345678", "subnet-12345678", "subnet-23456789"), ("vpc-34567891", "subnet-45678912", "subnet-45678912")], ) -def test_valid_subnet(mocker, vpc_id, master_id, compute_id): +def test_valid_subnet(mocker, vpc_id, head_node_id, compute_id): # valid subnets assert_that( - general_wrapper_for_prompt_testing(mocker, vpc_id=vpc_id, master_id=master_id, compute_id=compute_id) + general_wrapper_for_prompt_testing(mocker, vpc_id=vpc_id, head_node_id=head_node_id, compute_id=compute_id) ).is_true() @@ -917,7 +917,7 @@ def test_hit_config_file(mocker, capsys, test_datadir): def test_invalid_p4d_head_node_type(mocker): with pytest.raises(StopIteration): - assert_that(general_wrapper_for_prompt_testing(mocker, master_instance="p4d.24xlarge")).is_true() + assert_that(general_wrapper_for_prompt_testing(mocker, head_node_instance="p4d.24xlarge")).is_true() def test_valid_p4d_compute_node_type(mocker): diff --git a/cli/tests/pcluster/test_utils.py b/cli/tests/pcluster/test_utils.py index ac6d1191a4..6195489b77 100644 --- a/cli/tests/pcluster/test_utils.py +++ b/cli/tests/pcluster/test_utils.py @@ -527,9 +527,9 @@ def test_get_supported_architectures_for_instance_type(mocker, instance_type, su @pytest.mark.parametrize( "node_type, expected_fallback, expected_response, expected_instances", [ - (utils.NodeType.master, False, {"Reservations": [{"Groups": [], "Instances": [{}]}]}, 1), - (utils.NodeType.master, True, {"Reservations": [{"Groups": [], "Instances": [{}]}]}, 1), - (utils.NodeType.master, True, {"Reservations": []}, 0), + (utils.NodeType.head_node, False, {"Reservations": [{"Groups": [], "Instances": [{}]}]}, 1), + (utils.NodeType.head_node, True, {"Reservations": [{"Groups": [], "Instances": [{}]}]}, 1), + (utils.NodeType.head_node, True, {"Reservations": 
[]}, 0), (utils.NodeType.compute, False, {"Reservations": [{"Groups": [], "Instances": [{}, {}, {}]}]}, 3), (utils.NodeType.compute, True, {"Reservations": [{"Groups": [], "Instances": [{}, {}]}]}, 2), (utils.NodeType.compute, True, {"Reservations": []}, 0), @@ -570,7 +570,7 @@ def test_describe_cluster_instances(boto3_stubber, node_type, expected_fallback, @pytest.mark.parametrize( - "master_instance, expected_ip, error", + "head_node_instance, expected_ip, error", [ ( { @@ -594,17 +594,17 @@ def test_describe_cluster_instances(boto3_stubber, node_type, expected_fallback, ], ids=["public_ip", "private_ip", "stopped"], ) -def test_get_master_server_ips(mocker, master_instance, expected_ip, error): +def test_get_head_node_ips(mocker, head_node_instance, expected_ip, error): describe_cluster_instances_mock = mocker.patch( - "pcluster.utils.describe_cluster_instances", return_value=[master_instance] + "pcluster.utils.describe_cluster_instances", return_value=[head_node_instance] ) if error: with pytest.raises(SystemExit, match=error): - utils._get_master_server_ip("stack-name") + utils._get_head_node_ip("stack-name") else: - assert_that(utils._get_master_server_ip("stack-name")).is_equal_to(expected_ip) - describe_cluster_instances_mock.assert_called_with("stack-name", node_type=utils.NodeType.master) + assert_that(utils._get_head_node_ip("stack-name")).is_equal_to(expected_ip) + describe_cluster_instances_mock.assert_called_with("stack-name", node_type=utils.NodeType.head_node) @pytest.mark.parametrize( diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index b287b0315a..332e6f7f2b 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -332,7 +332,7 @@ "Default": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE" }, "EFSOptions": { - "Description": "Comma separated list of EFS related options, 9 parameters in total, [shared_dir,efs_fs_id,performance_mode,efs_kms_key_id,provisioned_throughput,encrypted,throughput_mode,exists_valid_master_mt,exists_valid_compute_mt]", + "Description": "Comma separated list of EFS related options, 9 parameters in total, [shared_dir,efs_fs_id,performance_mode,efs_kms_key_id,provisioned_throughput,encrypted,throughput_mode,exists_valid_head_node_mt,exists_valid_compute_mt]", "Type": "String", "Default": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE" }, diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index 0015224d5c..52e4b455fa 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -160,7 +160,7 @@ def region(self): return self.config.get("aws", "aws_region_name", fallback="us-east-1") @property - def master_ip(self): + def head_node_ip(self): """Return the public ip of the cluster master node.""" if "MasterPublicIP" in self.cfn_outputs: return self.cfn_outputs["MasterPublicIP"] diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index 73cec42784..686db3adfb 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -31,12 +31,12 @@ def __init__(self, cluster, username=None): if not username: username = get_username_for_os(cluster.os) self.__connection = Connection( - host=cluster.master_ip, + host=cluster.head_node_ip, user=username, forward_agent=False, connect_kwargs={"key_filename": 
[cluster.ssh_key]}, ) - self.__user_at_hostname = "{0}@{1}".format(username, cluster.master_ip) + self.__user_at_hostname = "{0}@{1}".format(username, cluster.head_node_ip) def __del__(self): try: diff --git a/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py b/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py index 59cf9aed93..af8f3727d6 100644 --- a/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py +++ b/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py @@ -29,9 +29,9 @@ DEFAULT_SHARED_DIR = "/shared" DEFAULT_RETENTION_DAYS = 14 NODE_CONFIG_PATH = "/etc/chef/dna.json" -MASTER_NODE_ROLE_NAME = "MasterServer" +HEAD_NODE_ROLE_NAME = "MasterServer" COMPUTE_NODE_ROLE_NAME = "ComputeFleet" -NODE_ROLE_NAMES = {MASTER_NODE_ROLE_NAME, COMPUTE_NODE_ROLE_NAME} +NODE_ROLE_NAMES = {HEAD_NODE_ROLE_NAME, COMPUTE_NODE_ROLE_NAME} def _get_log_group_name_for_cluster(cluster_name): @@ -95,8 +95,8 @@ def __init__(self, scheduler, os, cluster, feature_key=None, shared_dir=DEFAULT_ self.shared_dir = self._get_shared_dir(shared_dir) self.remote_command_executor = RemoteCommandExecutor(self.cluster) self.scheduler_commands = get_scheduler_commands(self.scheduler, self.remote_command_executor) - self._relevant_logs = {MASTER_NODE_ROLE_NAME: [], COMPUTE_NODE_ROLE_NAME: []} - self._cluster_log_state = {MASTER_NODE_ROLE_NAME: {}, COMPUTE_NODE_ROLE_NAME: {}} + self._relevant_logs = {HEAD_NODE_ROLE_NAME: [], COMPUTE_NODE_ROLE_NAME: []} + self._cluster_log_state = {HEAD_NODE_ROLE_NAME: {}, COMPUTE_NODE_ROLE_NAME: {}} self._set_cluster_log_state() @property @@ -115,7 +115,7 @@ def is_feature_specific(self): def get_logs_state(self): """Get the state of the log files applicable to each of the cluster's EC2 instances.""" desired_keys = ["hostname", "instance_id", "node_role", "agent_status", "logs"] - states = [{key: self._cluster_log_state.get(MASTER_NODE_ROLE_NAME).get(key) for key in desired_keys}] + states = [{key: self._cluster_log_state.get(HEAD_NODE_ROLE_NAME).get(key) for key in desired_keys}] states.extend( [ {key: host_dict[key] for key in desired_keys} @@ -145,11 +145,11 @@ def _base_os_to_platform(base_os): no_digits = base_os.rstrip(string.digits) return translations.get(no_digits, no_digits) - def _set_master_instance(self, instance): - """Set the master instance field in self.cluster_log_state.""" - self._cluster_log_state.get(MASTER_NODE_ROLE_NAME).update( + def _set_head_node_instance(self, instance): + """Set the head node instance field in self.cluster_log_state.""" + self._cluster_log_state.get(HEAD_NODE_ROLE_NAME).update( { - "node_role": MASTER_NODE_ROLE_NAME, + "node_role": HEAD_NODE_ROLE_NAME, "hostname": instance.get("PrivateDnsName"), "instance_id": instance.get("InstanceId"), } @@ -157,7 +157,7 @@ def _set_master_instance(self, instance): def _add_compute_instance(self, instance): """Update the cluster's log state by adding a compute node.""" - compute_hostname = self._run_command_on_master( + compute_hostname = self._run_command_on_head_node( "ssh -o StrictHostKeyChecking=no -q {} hostname -f".format(instance.get("PrivateDnsName")) ) self._cluster_log_state[COMPUTE_NODE_ROLE_NAME][compute_hostname] = { @@ -173,24 +173,24 @@ def _get_initial_cluster_log_state(self): if tags.get("ClusterName", "") != self.cluster.name: continue elif tags.get("Name", "") == "Master": - self._set_master_instance(instance) + self._set_head_node_instance(instance) else: self._add_compute_instance(instance) 
LOGGER.debug("After getting initial cluster state:\n{0}".format(self._dump_cluster_log_state())) - def _read_log_configs_from_master(self): + def _read_log_configs_from_head_node(self): """Read the log configs file at /usr/local/etc/cloudwatch_log_files.json.""" read_cmd = "cat /usr/local/etc/cloudwatch_log_files.json" - config = json.loads(self._run_command_on_master(read_cmd)) + config = json.loads(self._run_command_on_head_node(read_cmd)) return config.get("log_configs") - def _read_master_node_config(self): - """Read the node configuration JSON file at NODE_CONFIG_PATH on the master node.""" + def _read_head_node_config(self): + """Read the node configuration JSON file at NODE_CONFIG_PATH on the head node.""" read_cmd = "cat {0}".format(NODE_CONFIG_PATH) - master_node_config = json.loads(self._run_command_on_master(read_cmd)).get("cfncluster", {}) - assert_that(master_node_config).is_not_empty() - LOGGER.info("DNA config read from master node: {0}".format(_dump_json(master_node_config))) - return master_node_config + head_node_config = json.loads(self._run_command_on_head_node(read_cmd)).get("cfncluster", {}) + assert_that(head_node_config).is_not_empty() + LOGGER.info("DNA config read from head node: {0}".format(_dump_json(head_node_config))) + return head_node_config def _read_compute_node_config(self): """Read the node configuration JSON file at NODE_CONFIG_PATH on a compute node.""" @@ -209,7 +209,7 @@ def _read_compute_node_config(self): def _read_node_configs(self): """Return a dict mapping node role names to the config at NODE_CONFIG_PATH.""" return { - MASTER_NODE_ROLE_NAME: self._read_master_node_config(), + HEAD_NODE_ROLE_NAME: self._read_head_node_config(), COMPUTE_NODE_ROLE_NAME: self._read_compute_node_config(), } @@ -273,7 +273,7 @@ def _populate_relevant_logs_for_node_roles(self, logs): """Populate self._relevant_logs with the entries of logs.""" # When the scheduler is AWS Batch, only keep log that whose config's node_role value is MasterServer, since # Batch doesn't have compute nodes in the traditional sense. 
- desired_node_roles = {MASTER_NODE_ROLE_NAME} if self.scheduler == "awsbatch" else NODE_ROLE_NAMES + desired_node_roles = {HEAD_NODE_ROLE_NAME} if self.scheduler == "awsbatch" else NODE_ROLE_NAMES for log in logs: for node_role in set(log.get("node_roles")) & desired_node_roles: self._relevant_logs[node_role].append(self._clean_log_config(log)) @@ -286,8 +286,8 @@ def _filter_logs(self, logs): def _create_log_entries_for_nodes(self): """Create an entry for each relevant log in self._cluster_log_state.""" - self._cluster_log_state[MASTER_NODE_ROLE_NAME]["logs"] = { - log.get("file_path"): log for log in self._relevant_logs.get(MASTER_NODE_ROLE_NAME) + self._cluster_log_state[HEAD_NODE_ROLE_NAME]["logs"] = { + log.get("file_path"): log for log in self._relevant_logs.get(HEAD_NODE_ROLE_NAME) } for _hostname, compute_instance_dict in self._cluster_log_state.get(COMPUTE_NODE_ROLE_NAME).items(): compute_instance_dict["logs"] = { @@ -296,19 +296,19 @@ def _create_log_entries_for_nodes(self): def _get_relevant_logs(self): """Get subset of all log configs that apply to this cluster's scheduler/os combo.""" - logs = self._read_log_configs_from_master() + logs = self._read_log_configs_from_head_node() self._filter_logs(logs) self._create_log_entries_for_nodes() LOGGER.debug("After populating relevant logs:\n{0}".format(self._dump_cluster_log_state())) - def _run_command_on_master(self, cmd): + def _run_command_on_head_node(self, cmd): """Run cmd on cluster's MasterServer.""" return self.remote_command_executor.run_remote_command(cmd, timeout=60).stdout.strip() def _run_command_on_computes(self, cmd, assert_success=True): """Run cmd on all computes in the cluster.""" # Create directory in self.shared_dir to direct outputs to - out_dir = Path(self._run_command_on_master("mktemp -d -p {shared_dir}".format(shared_dir=self.shared_dir))) + out_dir = Path(self._run_command_on_head_node("mktemp -d -p {shared_dir}".format(shared_dir=self.shared_dir))) redirect = " > {out_dir}/$(hostname -f) ".format(out_dir=out_dir) remote_cmd = cmd.format(redirect=redirect) @@ -321,17 +321,17 @@ def _run_command_on_computes(self, cmd, assert_success=True): # Read the output and map it to the hostname outputs = {} - result_files = self._run_command_on_master("ls {0}".format(out_dir)) + result_files = self._run_command_on_head_node("ls {0}".format(out_dir)) for hostname in result_files.split(): - outputs[hostname] = self._run_command_on_master("sudo cat {0}".format(out_dir / hostname)) - self._run_command_on_master("rm -rf {0}".format(out_dir)) + outputs[hostname] = self._run_command_on_head_node("sudo cat {0}".format(out_dir / hostname)) + self._run_command_on_head_node("rm -rf {0}".format(out_dir)) return outputs - def _populate_master_log_existence(self): + def _populate_head_node_log_existence(self): """Figure out which of the relevant logs for the MasterServer don't exist.""" - for log_path, log_dict in self._cluster_log_state.get(MASTER_NODE_ROLE_NAME).get("logs").items(): + for log_path, log_dict in self._cluster_log_state.get(HEAD_NODE_ROLE_NAME).get("logs").items(): cmd = "[ -f {path} ] && echo exists || echo does not exist".format(path=log_path) - output = self._run_command_on_master(cmd) + output = self._run_command_on_head_node(cmd) log_dict["exists"] = output == "exists" def _populate_compute_log_existence(self): @@ -354,16 +354,16 @@ def _populate_compute_log_existence(self): def _populate_log_existence(self): """Figure out which of the relevant logs for each node type don't exist.""" - 
self._populate_master_log_existence() + self._populate_head_node_log_existence() self._populate_compute_log_existence() LOGGER.debug("After populating log existence:\n{0}".format(self._dump_cluster_log_state())) - def _populate_master_log_emptiness_and_tail(self): + def _populate_head_node_log_emptiness_and_tail(self): """Figure out which of the relevant logs for the MasterServer are empty.""" - for log_path, log_dict in self._cluster_log_state.get(MASTER_NODE_ROLE_NAME).get("logs").items(): + for log_path, log_dict in self._cluster_log_state.get(HEAD_NODE_ROLE_NAME).get("logs").items(): if not log_dict.get("exists"): continue - output = self._run_command_on_master("sudo tail -n 1 {path}".format(path=log_path)) + output = self._run_command_on_head_node("sudo tail -n 1 {path}".format(path=log_path)) log_dict["is_empty"] = output == "" log_dict["tail"] = output @@ -392,15 +392,15 @@ def _populate_compute_log_emptiness_and_tail(self): def _populate_log_emptiness_and_tail(self): """Figure out which of the relevant logs for each node type are empty.""" - self._populate_master_log_emptiness_and_tail() + self._populate_head_node_log_emptiness_and_tail() self._populate_compute_log_emptiness_and_tail() LOGGER.debug("After populating log emptiness and tails:\n{0}".format(self._dump_cluster_log_state())) - def _populate_master_agent_status(self): + def _populate_head_node_agent_status(self): """Get the cloudwatch agent's status for the MasterServer.""" status_cmd = "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a status" - status = json.loads(self._run_command_on_master(status_cmd)) - self._cluster_log_state[MASTER_NODE_ROLE_NAME]["agent_status"] = status.get("status") + status = json.loads(self._run_command_on_head_node(status_cmd)) + self._cluster_log_state[HEAD_NODE_ROLE_NAME]["agent_status"] = status.get("status") def _populate_compute_agent_status(self): """Get the cloudwatch agent's status for all the compute nodes in the cluster.""" @@ -414,7 +414,7 @@ def _populate_compute_agent_status(self): def _populate_agent_status(self): """Get the cloudwatch agent's status for all the nodes in the cluster.""" - self._populate_master_agent_status() + self._populate_head_node_agent_status() self._populate_compute_agent_status() LOGGER.debug("After populating agent statuses:\n{0}".format(self._dump_cluster_log_state())) diff --git a/tests/integration-tests/tests/create/test_create.py b/tests/integration-tests/tests/create/test_create.py index 0e5079bfc1..d80929462c 100644 --- a/tests/integration-tests/tests/create/test_create.py +++ b/tests/integration-tests/tests/create/test_create.py @@ -76,7 +76,7 @@ def _assert_head_node_is_running(region, cluster): logging.info("Asserting the head node is running") head_node_state = ( boto3.client("ec2", region_name=region) - .describe_instances(Filters=[{"Name": "ip-address", "Values": [cluster.master_ip]}]) + .describe_instances(Filters=[{"Name": "ip-address", "Values": [cluster.head_node_ip]}]) .get("Reservations")[0] .get("Instances")[0] .get("State") diff --git a/tests/integration-tests/tests/dcv/test_dcv.py b/tests/integration-tests/tests/dcv/test_dcv.py index c7ddb9e789..3b25b1e1a3 100644 --- a/tests/integration-tests/tests/dcv/test_dcv.py +++ b/tests/integration-tests/tests/dcv/test_dcv.py @@ -119,13 +119,13 @@ def _test_dcv_configuration( # add ssh key to jenkins user known hosts file to avoid ssh keychecking prompt host_keys_file = operating_system.path.expanduser("~/.ssh/known_hosts") - add_keys_to_known_hosts(cluster.master_ip, 
host_keys_file) + add_keys_to_known_hosts(cluster.head_node_ip, host_keys_file) try: result = run_command(["pcluster", "dcv", "connect", cluster.name, "--show-url"], env=env) finally: # remove ssh key from jenkins user known hosts file - remove_keys_from_known_hosts(cluster.master_ip, host_keys_file, env=env) + remove_keys_from_known_hosts(cluster.head_node_ip, host_keys_file, env=env) assert_that(result.stdout).matches( r"Please use the following one-time URL in your browser within 30 seconds:\n" diff --git a/tests/integration-tests/tests/multiple_nics/test_multiple_nics.py b/tests/integration-tests/tests/multiple_nics/test_multiple_nics.py index a865bec94b..f68bfc951b 100644 --- a/tests/integration-tests/tests/multiple_nics/test_multiple_nics.py +++ b/tests/integration-tests/tests/multiple_nics/test_multiple_nics.py @@ -27,7 +27,7 @@ def test_multiple_nics(scheduler, region, pcluster_config_reader, clusters_facto remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) - _test_master_node_nics(remote_command_executor, region) + _test_head_node_nics(remote_command_executor, region) _test_compute_node_nics(cluster, region, remote_command_executor, scheduler_commands) @@ -40,16 +40,16 @@ def _get_private_ip_addresses(instance_id, region, remote_command_executor): return result.stdout.strip().split("\n") -def _test_master_node_nics(remote_command_executor, region): - # On the master node we just check that all the private IPs have been assigned to NICs - master_instance_id = remote_command_executor.run_remote_command( +def _test_head_node_nics(remote_command_executor, region): + # On the head node we just check that all the private IPs have been assigned to NICs + head_node_instance_id = remote_command_executor.run_remote_command( "curl http://169.254.169.254/latest/meta-data/instance-id" ).stdout - master_ip_addresses = _get_private_ip_addresses(master_instance_id, region, remote_command_executor) + head_node_ip_addresses = _get_private_ip_addresses(head_node_instance_id, region, remote_command_executor) ip_a_result = remote_command_executor.run_remote_command("ip a").stdout - for ip_address in master_ip_addresses: + for ip_address in head_node_ip_addresses: assert_that(ip_a_result).matches(".* inet {0}.*".format(ip_address)) diff --git a/tests/integration-tests/tests/storage/snapshots_factory.py b/tests/integration-tests/tests/storage/snapshots_factory.py index 6e9017a986..be208ee747 100644 --- a/tests/integration-tests/tests/storage/snapshots_factory.py +++ b/tests/integration-tests/tests/storage/snapshots_factory.py @@ -18,7 +18,7 @@ from retrying import retry from utils import random_alphanumeric -SnapshotConfig = namedtuple("ClusterConfig", ["ssh_key", "key_name", "vpc_id", "master_subnet_id"]) +SnapshotConfig = namedtuple("ClusterConfig", ["ssh_key", "key_name", "vpc_id", "head_node_subnet_id"]) class EBSSnapshotsFactory: @@ -84,7 +84,7 @@ def _create_volume_process(self, region, snapshot_config): self.security_group_id = self._get_security_group_id() - subnet = self.ec2.Subnet(self.config.master_subnet_id) + subnet = self.ec2.Subnet(self.config.head_node_subnet_id) # Create a new volume and attach to the instance self.volume = self._create_volume(subnet) diff --git a/tests/integration-tests/tests/storage/storage_common.py b/tests/integration-tests/tests/storage/storage_common.py index 7528ff180d..14353a4680 100644 --- a/tests/integration-tests/tests/storage/storage_common.py +++ 
b/tests/integration-tests/tests/storage/storage_common.py @@ -13,13 +13,13 @@ def verify_directory_correctly_shared(remote_command_executor, mount_dir, scheduler_commands): - master_file = random_alphanumeric() + head_node_file = random_alphanumeric() compute_file = random_alphanumeric() remote_command_executor.run_remote_command( - "touch {mount_dir}/{master_file}".format(mount_dir=mount_dir, master_file=master_file) + "touch {mount_dir}/{head_node_file}".format(mount_dir=mount_dir, head_node_file=head_node_file) ) - job_command = "cat {mount_dir}/{master_file} && touch {mount_dir}/{compute_file}".format( - mount_dir=mount_dir, master_file=master_file, compute_file=compute_file + job_command = "cat {mount_dir}/{head_node_file} && touch {mount_dir}/{compute_file}".format( + mount_dir=mount_dir, head_node_file=head_node_file, compute_file=compute_file ) result = scheduler_commands.submit_command(job_command) diff --git a/tests/integration-tests/tests/storage/test_efs.py b/tests/integration-tests/tests/storage/test_efs.py index fd4a5efee8..2e458b6b81 100644 --- a/tests/integration-tests/tests/storage/test_efs.py +++ b/tests/integration-tests/tests/storage/test_efs.py @@ -223,11 +223,11 @@ def _test_efs_correctly_mounted(remote_command_executor, mount_dir): def _assert_subnet_az_relations(region, vpc_stack, expected_in_same_az): vpc = get_vpc_snakecase_value(vpc_stack) - master_subnet_id = vpc["public_subnet_id"] + head_node_subnet_id = vpc["public_subnet_id"] compute_subnet_id = vpc["private_subnet_id"] if expected_in_same_az else vpc["private_additional_cidr_subnet_id"] - master_subnet_az = boto3.resource("ec2", region_name=region).Subnet(master_subnet_id).availability_zone + head_node_subnet_az = boto3.resource("ec2", region_name=region).Subnet(head_node_subnet_id).availability_zone compute_subnet_az = boto3.resource("ec2", region_name=region).Subnet(compute_subnet_id).availability_zone if expected_in_same_az: - assert_that(master_subnet_az).is_equal_to(compute_subnet_az) + assert_that(head_node_subnet_az).is_equal_to(compute_subnet_az) else: - assert_that(master_subnet_az).is_not_equal_to(compute_subnet_az) + assert_that(head_node_subnet_az).is_not_equal_to(compute_subnet_az) diff --git a/util/cfn-stacks-generators/generate-efs-substack.py b/util/cfn-stacks-generators/generate-efs-substack.py index 22a5ebaa9d..2aad838606 100644 --- a/util/cfn-stacks-generators/generate-efs-substack.py +++ b/util/cfn-stacks-generators/generate-efs-substack.py @@ -7,7 +7,7 @@ def main(args): t = Template() # [0 shared_dir, 1 efs_fs_id, 2 performance_mode, 3 efs_kms_key_id, - # 4 provisioned_throughput, 5 encrypted, 6 throughput_mode, 7 exists_valid_master_mt, 8 exists_valid_compute_mt] + # 4 provisioned_throughput, 5 encrypted, 6 throughput_mode, 7 exists_valid_head_node_mt, 8 exists_valid_compute_mt] efs_options = t.add_parameter( Parameter( "EFSOptions", @@ -18,7 +18,7 @@ def main(args): compute_security_group = t.add_parameter( Parameter("ComputeSecurityGroup", Type="String", Description="Security Group for Mount Target") ) - master_subnet_id = t.add_parameter( + head_node_subnet_id = t.add_parameter( Parameter("MasterSubnetId", Type="String", Description="Master subnet id for master mount target") ) compute_subnet_id = t.add_parameter( @@ -33,7 +33,7 @@ def main(args): "CreateEFS", And(Not(Equals(Select(str(0), Ref(efs_options)), "NONE")), Equals(Select(str(1), Ref(efs_options)), "NONE")), ) - create_master_mt = t.add_condition( + create_head_node_mt = t.add_condition( "CreateMasterMT", 
And(Not(Equals(Select(str(0), Ref(efs_options)), "NONE")), Equals(Select(str(7), Ref(efs_options)), "NONE")), ) @@ -43,10 +43,10 @@ def main(args): ) # Need to create compute mount target if: # user is providing a compute subnet and - # there is no existing MT in compute subnet's AZ(includes case where master AZ == compute AZ). + # there is no existing MT in compute subnet's AZ(includes case where head node AZ == compute AZ). # - # If user is not providing a compute subnet, either we are using the master subnet as compute subnet, - # or we will be creating a compute subnet that is in the same AZ as master subnet, + # If user is not providing a compute subnet, either we are using the head node subnet as compute subnet, + # or we will be creating a compute subnet that is in the same AZ as head node subnet, # see ComputeSubnet resource in the main stack. # In both cases no compute MT is needed. create_compute_mt = t.add_condition( @@ -82,8 +82,8 @@ def main(args): "MasterSubnetEFSMT", FileSystemId=If(create_efs, Ref(fs), Select(str(1), Ref(efs_options))), SecurityGroups=[Ref(compute_security_group)], - SubnetId=Ref(master_subnet_id), - Condition=create_master_mt, + SubnetId=Ref(head_node_subnet_id), + Condition=create_head_node_mt, ) ) From 1d9db149a4de5d96d0d90d149596b99c6613c4eb Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 30 Nov 2020 11:49:47 +0100 Subject: [PATCH 19/66] Rename master_* to head_node_* in comments Signed-off-by: Enrico Usai --- README.md | 4 ++-- cli/README | 4 ++-- cli/src/awsbatch/examples/awsbatch-cli.cfg | 2 +- cli/src/pcluster/config/cfn_param_types.py | 14 ++++++------- cli/src/pcluster/config/mappings.py | 2 +- cli/src/pcluster/config/resource_map.py | 2 +- cli/src/pcluster/config/validators.py | 2 +- cli/src/pcluster/configure/easyconfig.py | 2 +- cli/src/pcluster/configure/networking.py | 4 ++-- cli/src/pcluster/dcv/connect.py | 4 ++-- cli/src/pcluster/examples/config | 12 +++++------ .../pcluster/models/sit/sit_cluster_model.py | 4 ++-- cli/src/pcluster/utils.py | 20 +++++++++---------- .../pcluster/config/test_section_cluster.py | 4 ++-- .../pcluster.config.ini | 2 +- .../pcluster.config.ini | 2 +- tests/integration-tests/README.md | 2 +- tests/integration-tests/clusters_factory.py | 2 +- tests/integration-tests/conftest.py | 2 +- .../remote_command_executor.py | 8 ++++---- .../tests/cfn-init/test_cfn_init.py | 2 +- .../test_cloudwatch_logging.py | 14 ++++++------- .../tests/common/schedulers_common.py | 2 +- .../test_disable_hyperthreading.py | 2 +- tests/integration-tests/tests/efa/test_efa.py | 2 +- .../tests/intel_hpc/test_intel_hpc.py | 2 +- .../tests/runtime_bake/test_runtime_bake.py | 2 +- .../cluster-check.sh | 2 +- .../tests/schedulers/test_torque.py | 2 +- .../tests/storage/test_efs.py | 4 ++-- .../test_efs_compute_az/pcluster.config.ini | 2 +- .../generate-efs-substack.py | 2 +- 32 files changed, 68 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index f0d3dc4a27..de2ef0c40d 100644 --- a/README.md +++ b/README.md @@ -70,9 +70,9 @@ Automate VPC creation? (y/n) [n]: Enter ``n`` if you already have a VPC suitable for the cluster. Otherwise you can let ``pcluster configure`` create a VPC for you. The same choice is given for the subnet: you can select a valid subnet ID for -both the master and compute nodes, or you can let ``pcluster configure`` set up everything for you. +both the head node and compute nodes, or you can let ``pcluster configure`` set up everything for you. 
The same choice is given for the subnet configuration: you can select a valid subnet ID for both -the master and compute nodes, or you can let pcluster configure set up everything for you. +the head node and compute nodes, or you can let pcluster configure set up everything for you. In the latter case, just select the configuration you prefer. ``` diff --git a/cli/README b/cli/README index 1a942019d7..8c412c530b 100644 --- a/cli/README +++ b/cli/README @@ -17,11 +17,11 @@ You can build higher level workflows, such as a Genomics portal that automates t update Updates a running cluster. delete Deletes a cluster. start Starts the compute fleet that has been stopped. - stop Stops the compute fleet, but leave the master server running for debugging/development. + stop Stops the compute fleet, but leave the head node running for debugging/development. status Pulls the current status of the cluster. list Displays a list of stacks associated with AWS ParallelCluster. instances Displays a list of all instances in a cluster. - ssh Connects to the master server using SSH. + ssh Connects to the head node using SSH. configure Starts the AWS ParallelCluster configuration. version Displays version of AWS ParallelCluster. createami (Linux/macOS) Creates a custom AMI to use with AWS ParallelCluster. diff --git a/cli/src/awsbatch/examples/awsbatch-cli.cfg b/cli/src/awsbatch/examples/awsbatch-cli.cfg index 164fcc2e48..6b92bc381e 100644 --- a/cli/src/awsbatch/examples/awsbatch-cli.cfg +++ b/cli/src/awsbatch/examples/awsbatch-cli.cfg @@ -37,7 +37,7 @@ job_definition = arn:aws:batch:::job-definition/parallelclus job_definition_mnp = arn:aws:batch:::job-definition/parallelcluster--mnp:1 # HTTP(S) proxy server, typically http://x.x.x.x:8080, used for internal boto3 calls proxy = NONE -# Private Master IP, used internally in the job submission phase. +# Private head node IP, used internally in the job submission phase. master_ip = x.x.x.x # Environment blacklist variables # Comma separated list of environment variable names to not export when submitting a job with "--env all" parameter diff --git a/cli/src/pcluster/config/cfn_param_types.py b/cli/src/pcluster/config/cfn_param_types.py index 4d84b61a69..84cd70efbb 100644 --- a/cli/src/pcluster/config/cfn_param_types.py +++ b/cli/src/pcluster/config/cfn_param_types.py @@ -590,7 +590,7 @@ class HeadNodeAvailabilityZoneCfnParam(AvailabilityZoneCfnParam): """ def from_file(self, config_parser): - """Initialize the Availability zone of the cluster by checking the Master Subnet.""" + """Initialize the Availability zone of the cluster by checking the head node Subnet.""" self._init_az(config_parser, "master_subnet_id") return self @@ -667,7 +667,7 @@ def to_cfn(self): """ Define the Cores CFN parameter if disable_hyperthreading = true. 
- :return: string (cores_master,cores_compute,master_supports_cpu_options,compute_supports_cpu_options) + :return: string (head_node_cores,compute_cores,head_node_supports_cpu_options,compute_supports_cpu_options) """ cfn_params = {self.definition.get("cfn_param_mapping"): "NONE,NONE,NONE,NONE"} cluster_config = self.pcluster_config.get_section(self.section_key) @@ -798,7 +798,7 @@ class BaseOSCfnParam(CfnParam): @staticmethod def get_instance_type_architecture(instance_type): - """Compute cluster's 'Architecture' CFN parameter based on its master server instance type.""" + """Compute cluster's 'Architecture' CFN parameter based on its head node instance type.""" if not instance_type: error("Cannot infer architecture without master instance type") head_node_supported_architectures = get_supported_architectures_for_instance_type(instance_type) @@ -808,7 +808,7 @@ def get_instance_type_architecture(instance_type): # If the instance type supports multiple architectures, choose the first one. # TODO: this is currently not an issue because none of the instance types we support more than one of the # architectures we support. If this were ever to change (e.g., we start supporting i386) then we would - # probably need to choose based on the subset of the architecutres supported by both the master and + # probably need to choose based on the subset of the architecutres supported by both the head node and # compute instance types. return head_node_supported_architectures[0] @@ -1077,11 +1077,11 @@ class NetworkInterfacesCountCfnParam(CommaSeparatedCfnParam): Class to manage NetworkInterfacesCount Cfn param. The internal value is a list of two items, which respectively indicate the number of network interfaces to activate - on master and compute nodes. + on head node and compute nodes. """ def refresh(self): - """Compute the number of network interfaces for master and compute nodes.""" + """Compute the number of network interfaces for head node and compute nodes.""" cluster_section = self.pcluster_config.get_section("cluster") scheduler = cluster_section.get_param_value("scheduler") self.value = [ @@ -1235,7 +1235,7 @@ def to_storage(self, storage_params=None): compute_mt_valid = bool(compute_mount_target_id) cfn_items.append("Valid" if head_node_mt_valid else "NONE") - # Do not create additional compute mount target if compute and master subnet in the same AZ + # Do not create additional compute mount target if compute and head node subnet in the same AZ cfn_items.append("Valid" if compute_mt_valid or (head_node_avail_zone == compute_avail_zone) else "NONE") cfn_params[cfn_converter] = ",".join(cfn_items) diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py index 38e9b64699..2cf7221d79 100644 --- a/cli/src/pcluster/config/mappings.py +++ b/cli/src/pcluster/config/mappings.py @@ -768,7 +768,7 @@ "required": True, "update_policy": UpdatePolicy.UNSUPPORTED }), - # Master + # Head node ("master_instance_type", { "default": "t2.micro", "cfn_param_mapping": "MasterInstanceType", diff --git a/cli/src/pcluster/config/resource_map.py b/cli/src/pcluster/config/resource_map.py index 4776ed47c5..4220869574 100644 --- a/cli/src/pcluster/config/resource_map.py +++ b/cli/src/pcluster/config/resource_map.py @@ -22,7 +22,7 @@ class ResourceArray(object): """ Represents a set of available resources for a single resource type. - For instance, this class can represent the available EBS volume resources that can be attached to a master node. 
+ For instance, this class can represent the available EBS volume resources that can be attached to a head node. """ def __init__(self, resources): diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index 684cd9e79d..b0ac26d3b7 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -952,7 +952,7 @@ def cluster_validator(section_key, section_label, pcluster_config): def instances_architecture_compatibility_validator(param_key, param_value, pcluster_config): - """Verify that master and compute instance types imply compatible architectures.""" + """Verify that head node and compute instance types imply compatible architectures.""" errors = [] warnings = [] diff --git a/cli/src/pcluster/configure/easyconfig.py b/cli/src/pcluster/configure/easyconfig.py index f4b0741906..57ac022f70 100644 --- a/cli/src/pcluster/configure/easyconfig.py +++ b/cli/src/pcluster/configure/easyconfig.py @@ -419,7 +419,7 @@ def get_parameters_to_reset(self): def cache_qualified_az(self): """ - Call API once for both master and compute instance type. + Call API once for both head node and compute instance type. Cache is done inside get get_supported_az_for_instance_types. """ diff --git a/cli/src/pcluster/configure/networking.py b/cli/src/pcluster/configure/networking.py index dff6834677..291abcac5b 100644 --- a/cli/src/pcluster/configure/networking.py +++ b/cli/src/pcluster/configure/networking.py @@ -88,7 +88,7 @@ def _get_availability_zone(self): class PublicNetworkConfig(BaseNetworkConfig): - """The public configuration that creates one public subnet with master and compute fleet.""" + """The public configuration that creates one public subnet with head node and compute fleet.""" def __init__(self, availability_zones=None): super(PublicNetworkConfig, self).__init__( @@ -115,7 +115,7 @@ def _create(self, vpc_id, vpc_cidr, subnet_cidrs, internet_gateway_id, compute_s class PublicPrivateNetworkConfig(BaseNetworkConfig): - """The publicprivate configuration that creates one public subnet for master and one private subnet for compute.""" + """The public private config that creates one public subnet for head node and one private subnet for compute.""" def __init__(self, availability_zones=None): super(PublicPrivateNetworkConfig, self).__init__( diff --git a/cli/src/pcluster/dcv/connect.py b/cli/src/pcluster/dcv/connect.py index caa6f99f3d..2bb4afacae 100644 --- a/cli/src/pcluster/dcv/connect.py +++ b/cli/src/pcluster/dcv/connect.py @@ -40,7 +40,7 @@ def dcv_connect(args): # Parse configuration file to read the AWS section PclusterConfig.init_aws() # FIXME it always searches for the default configuration file - # Prepare ssh command to execute in the master instance + # Prepare ssh command to execute in the head node instance stack = get_stack(get_stack_name(args.cluster_name)) shared_dir = get_cfn_param(stack.get("Parameters"), "SharedDir") head_node_ip, username = get_head_node_ip_and_username(args.cluster_name) @@ -74,7 +74,7 @@ def dcv_connect(args): def _retrieve_dcv_session_url(ssh_cmd, cluster_name, head_node_ip): - """Connect by ssh to the master instance, prepare DCV session and return the DCV session URL.""" + """Connect by ssh to the head node instance, prepare DCV session and return the DCV session URL.""" try: LOGGER.debug("SSH command: {0}".format(ssh_cmd)) output = _check_command_output(ssh_cmd) diff --git a/cli/src/pcluster/examples/config b/cli/src/pcluster/examples/config index 201172f06d..4179a54f56 100644 --- 
a/cli/src/pcluster/examples/config +++ b/cli/src/pcluster/examples/config @@ -23,7 +23,7 @@ key_name = mykey # Override path to cloudformation in S3 # (defaults to https://s3.amazonaws.com/-aws-parallelcluster/templates/aws-parallelcluster-.cfn.json) #template_url = https://s3.amazonaws.com/-aws-parallelcluster/templates/aws-parallelcluster-.cfn.json -# EC2 instance type for master node +# EC2 instance type for head node # (defaults to t2.micro) #master_instance_type = t2.micro # EC2 instance type for compute nodes @@ -94,7 +94,7 @@ key_name = mykey # Encrypted ephemeral drives. In-memory keys, non-recoverable. # (defaults to false) #encrypted_ephemeral = false -# MasterServer root volume size in GB. (AMI must support growroot) +# Head node root volume size in GB. (AMI must support growroot) # (defaults to 25) #master_root_volume_size = 25 # ComputeFleet root volume size in GB. (AMI must support growroot) @@ -137,7 +137,7 @@ vpc_settings = public [vpc public] # ID of the VPC you want to provision cluster into. vpc_id = vpc-12345678 -# ID of the Subnet you want to provision the Master server into +# ID of the Subnet you want to provision the head node into master_subnet_id = subnet-12345678 # SSH from CIDR # This is only used when AWS ParallelCluster creates the security group @@ -153,7 +153,7 @@ master_subnet_id = subnet-12345678 #[vpc private-new] # ID of the VPC you want to provision cluster into. #vpc_id = vpc-12345678 -# ID of the Subnet you want to provision the Master server into +# ID of the Subnet you want to provision the head node into #master_subnet_id = subnet-12345678 # CIDR for new backend subnet i.e. 10.0.100.0/24 #compute_subnet_cidr = 10.0.100.0/24 @@ -161,7 +161,7 @@ master_subnet_id = subnet-12345678 #[vpc private-existing] # ID of the VPC you want to provision cluster into. #vpc_id = vpc-12345678 -# ID of the Subnet you want to provision the Master server into +# ID of the Subnet you want to provision the head node into #master_subnet_id = subnet-12345678 # CIDR for new backend subnet i.e. 
10.0.100.0/24 #compute_subnet_id = subnet-23456789 @@ -184,7 +184,7 @@ master_subnet_id = subnet-12345678 # Use encrypted volume (should not be used with snapshots) # (defaults to false) #encrypted = false -# Existing EBS volume to be attached to the MasterServer +# Existing EBS volume to be attached to the head node # (defaults to NONE) #ebs_volume_id = NONE diff --git a/cli/src/pcluster/models/sit/sit_cluster_model.py b/cli/src/pcluster/models/sit/sit_cluster_model.py index b831dc9aed..11cc291691 100644 --- a/cli/src/pcluster/models/sit/sit_cluster_model.py +++ b/cli/src/pcluster/models/sit/sit_cluster_model.py @@ -117,7 +117,7 @@ def test_configuration(self, pcluster_config): head_node_network_interfaces = self.build_launch_network_interfaces( network_interfaces_count=int(cluster_section.get_param_value("network_interfaces_count")[0]), - use_efa=False, # EFA is not supported on master node + use_efa=False, # EFA is not supported on head node security_group_ids=security_groups_ids, subnet=head_node_subnet, use_public_ips=vpc_section.get_param_value("use_public_ips"), @@ -138,7 +138,7 @@ def test_configuration(self, pcluster_config): compute_network_interfaces_count = int(cluster_section.get_param_value("network_interfaces_count")[1]) enable_efa = "compute" == cluster_section.get_param_value("enable_efa") - # TODO: check if master == compute subnet condition is to take into account + # TODO: check if head node == compute subnet condition is to take into account use_public_ips = self.public_ips_in_compute_subnet(pcluster_config, compute_network_interfaces_count) network_interfaces = self.build_launch_network_interfaces( diff --git a/cli/src/pcluster/utils.py b/cli/src/pcluster/utils.py index 5f8704c219..5ada244ae8 100644 --- a/cli/src/pcluster/utils.py +++ b/cli/src/pcluster/utils.py @@ -49,7 +49,7 @@ class NodeType(Enum): """Enum that identifies the cluster node type.""" - master = "Master" + head_node = "Master" compute = "Compute" def __str__(self): @@ -842,22 +842,22 @@ def describe_cluster_instances(stack_name, node_type): def _get_head_node_ip(stack_name): """ - Get the IP Address of the MasterServer. + Get the IP Address of the head node. :param stack_name: The name of the cloudformation stack :param config: Config object :return private/public ip address """ - instances = describe_cluster_instances(stack_name, node_type=NodeType.master) + instances = describe_cluster_instances(stack_name, node_type=NodeType.head_node) if not instances: - error("MasterServer not running. Can't SSH") + error("Head node not running. 
Can't SSH") head_node = instances[0] ip_address = head_node.get("PublicIpAddress") if ip_address is None: ip_address = head_node.get("PrivateIpAddress") state = head_node.get("State").get("Name") if state != "running" or ip_address is None: - error("MasterServer: {0}\nCannot get ip address.".format(state.upper())) + error("Head node: {0}\nCannot get ip address.".format(state.upper())) return ip_address @@ -870,7 +870,7 @@ def get_head_node_ip_and_username(cluster_name): stack_status = stack_result.get("StackStatus") if stack_status in ["DELETE_COMPLETE", "DELETE_IN_PROGRESS"]: - error("Unable to retrieve master_ip and username for a stack in the status: {0}".format(stack_status)) + error("Unable to retrieve head node ip and username for a stack in the status: {0}".format(stack_status)) else: head_node_ip = _get_head_node_ip(stack_name) template = cfn.get_template(StackName=stack_name) @@ -891,14 +891,14 @@ def get_head_node_ip_and_username(cluster_name): def get_head_node_state(stack_name): """ - Get the State of the MasterServer. + Get the State of the head node. :param stack_name: The name of the cloudformation stack - :return master server state name + :return head node state name """ instances = describe_cluster_instances(stack_name, "Master") if not instances: - error("MasterServer not running.") + error("Head node not running.") return instances[0].get("State").get("Name") @@ -991,7 +991,7 @@ def get_batch_ce(stack_name): """ Get name of the AWS Batch Compute Environment. - :param stack_name: name of the master stack + :param stack_name: name of the head node stack :param config: config :return: ce_name or exit if not found """ diff --git a/cli/tests/pcluster/config/test_section_cluster.py b/cli/tests/pcluster/config/test_section_cluster.py index 2ccde19064..c94840063c 100644 --- a/cli/tests/pcluster/config/test_section_cluster.py +++ b/cli/tests/pcluster/config/test_section_cluster.py @@ -265,7 +265,7 @@ def test_hit_cluster_section_from_file(mocker, config_parser_dict, expected_dict ("placement", "wrong_value", None, "has an invalid value"), ("placement", "NONE", None, "has an invalid value"), ("placement", "cluster", "cluster", None), - # Master + # Head node # TODO add regex for master_instance_type ("master_instance_type", None, "t2.micro", None), ("master_instance_type", "", "", None), @@ -541,7 +541,7 @@ def test_sit_cluster_param_from_file( ("shared_dir", "/test//test2", None, "has an invalid value"), ("shared_dir", "/test\\test2", None, "has an invalid value"), ("shared_dir", "NONE", "NONE", None), # NONE is evaluated as a valid path - # Master + # Head node # TODO add regex for master_instance_type ("master_instance_type", None, "t2.micro", None), ("master_instance_type", "", "", None), diff --git a/cli/tests/pcluster/config/test_section_efs/test_efs_from_file_to_cfn/pcluster.config.ini b/cli/tests/pcluster/config/test_section_efs/test_efs_from_file_to_cfn/pcluster.config.ini index dada4b6d7c..3ba70269b6 100644 --- a/cli/tests/pcluster/config/test_section_efs/test_efs_from_file_to_cfn/pcluster.config.ini +++ b/cli/tests/pcluster/config/test_section_efs/test_efs_from_file_to_cfn/pcluster.config.ini @@ -13,7 +13,7 @@ scheduler = slurm base_os = alinux2 [vpc default] -# EFS conversion requires master subnet id to check mount-target avail zone +# EFS conversion requires head node subnet id to check mount-target avail zone master_subnet_id = subnet-12345678 compute_subnet_id = subnet-23456789 diff --git 
a/cli/tests/pcluster/config/test_section_fsx/test_fsx_from_file_to_cfn/pcluster.config.ini b/cli/tests/pcluster/config/test_section_fsx/test_fsx_from_file_to_cfn/pcluster.config.ini index 89d8fdc419..3ac84626d3 100644 --- a/cli/tests/pcluster/config/test_section_fsx/test_fsx_from_file_to_cfn/pcluster.config.ini +++ b/cli/tests/pcluster/config/test_section_fsx/test_fsx_from_file_to_cfn/pcluster.config.ini @@ -13,7 +13,7 @@ scheduler = slurm base_os = alinux2 [vpc default] -# FSX conversion requires master subnet id to check mount-target +# FSX conversion requires head node subnet id to check mount-target master_subnet_id = subnet-12345678 [fsx test1] diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md index 9e4b9fff85..ac78f2c275 100644 --- a/tests/integration-tests/README.md +++ b/tests/integration-tests/README.md @@ -706,7 +706,7 @@ included the CloudFormation stack outputs. ### Execute Remote Commands -To execute remote commands or scripts on the Master instance of the cluster under test, the `RemoteCommandExecutor` +To execute remote commands or scripts on the head node of the cluster under test, the `RemoteCommandExecutor` class can be used. It simply requires a valid `Cluster` object to be initialized and it offers some utility methods to execute remote commands and scripts as shown in the example below: diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index 52e4b455fa..87585a4673 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -161,7 +161,7 @@ def region(self): @property def head_node_ip(self): - """Return the public ip of the cluster master node.""" + """Return the public ip of the cluster head node.""" if "MasterPublicIP" in self.cfn_outputs: return self.cfn_outputs["MasterPublicIP"] else: diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index a4f892a321..55babf21f7 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -561,7 +561,7 @@ def vpc_stacks(cfn_stacks_factory, request): for region in regions: # Creating private_subnet_different_cidr in a different AZ for test_efs - # To-do: isolate this logic and create a compute subnet in different AZ than master in test_efs + # To-do: isolate this logic and create a compute subnet in different AZ than head node in test_efs # if region has a non-empty list in AVAILABILITY_ZONE_OVERRIDES, select a subset of those AZs credential = request.config.getoption("credential") diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index 686db3adfb..3088bbcf33 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -25,7 +25,7 @@ def __init__(self, result): class RemoteCommandExecutor: - """Execute remote commands on the cluster master node.""" + """Execute remote commands on the cluster head node.""" def __init__(self, cluster, username=None): if not username: @@ -57,7 +57,7 @@ def run_remote_command( timeout=None, ): """ - Execute remote command on the cluster master node. + Execute remote command on the cluster head node. :param command: command to execute. :param log_error: log errors. 
@@ -95,9 +95,9 @@ def run_remote_script( self, script_file, args=None, log_error=True, additional_files=None, hide=False, timeout=None, run_as_root=False ): """ - Execute a script remotely on the cluster master node. + Execute a script remotely on the cluster head node. - Script is copied to the master home dir before being executed. + Script is copied to the head node home dir before being executed. :param script_file: local path to the script to execute remotely. :param args: args to pass to the script when invoked. :param log_error: log errors. diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init.py b/tests/integration-tests/tests/cfn-init/test_cfn_init.py index 0b04217583..408de4dffc 100644 --- a/tests/integration-tests/tests/cfn-init/test_cfn_init.py +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init.py @@ -72,7 +72,7 @@ def test_install_args_quotes(region, pcluster_config_reader, clusters_factory, s init_config_file = pcluster_config_reader(bucket_name=bucket_name) cluster = clusters_factory(init_config_file) - # Check master and compute node status + # Check head node and compute node status _assert_server_status(cluster) diff --git a/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py b/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py index af8f3727d6..7c1b3cad04 100644 --- a/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py +++ b/tests/integration-tests/tests/cloudwatch_logging/test_cloudwatch_logging.py @@ -122,7 +122,7 @@ def get_logs_state(self): for hostname, host_dict in self._cluster_log_state.get(COMPUTE_NODE_ROLE_NAME).items() ] ) - assert_that(states).is_length(self.compute_nodes_count + 1) # computes + master + assert_that(states).is_length(self.compute_nodes_count + 1) # computes + head node return states @staticmethod @@ -302,7 +302,7 @@ def _get_relevant_logs(self): LOGGER.debug("After populating relevant logs:\n{0}".format(self._dump_cluster_log_state())) def _run_command_on_head_node(self, cmd): - """Run cmd on cluster's MasterServer.""" + """Run cmd on cluster's head node.""" return self.remote_command_executor.run_remote_command(cmd, timeout=60).stdout.strip() def _run_command_on_computes(self, cmd, assert_success=True): @@ -328,7 +328,7 @@ def _run_command_on_computes(self, cmd, assert_success=True): return outputs def _populate_head_node_log_existence(self): - """Figure out which of the relevant logs for the MasterServer don't exist.""" + """Figure out which of the relevant logs for the head node don't exist.""" for log_path, log_dict in self._cluster_log_state.get(HEAD_NODE_ROLE_NAME).get("logs").items(): cmd = "[ -f {path} ] && echo exists || echo does not exist".format(path=log_path) output = self._run_command_on_head_node(cmd) @@ -359,7 +359,7 @@ def _populate_log_existence(self): LOGGER.debug("After populating log existence:\n{0}".format(self._dump_cluster_log_state())) def _populate_head_node_log_emptiness_and_tail(self): - """Figure out which of the relevant logs for the MasterServer are empty.""" + """Figure out which of the relevant logs for the head node are empty.""" for log_path, log_dict in self._cluster_log_state.get(HEAD_NODE_ROLE_NAME).get("logs").items(): if not log_dict.get("exists"): continue @@ -397,7 +397,7 @@ def _populate_log_emptiness_and_tail(self): LOGGER.debug("After populating log emptiness and tails:\n{0}".format(self._dump_cluster_log_state())) def _populate_head_node_agent_status(self): - """Get the cloudwatch agent's status for the 
MasterServer.""" + """Get the cloudwatch agent's status for the head node.""" status_cmd = "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a status" status = json.loads(self._run_command_on_head_node(status_cmd)) self._cluster_log_state[HEAD_NODE_ROLE_NAME]["agent_status"] = status.get("status") @@ -424,7 +424,7 @@ def _set_cluster_log_state(self): In particular: * Identify which EC2 instances belong to this cluster - * Identify which logs are relevant to the MasterServer and ComputeFleet nodes + * Identify which logs are relevant to the head node and compute fleet nodes * Identify whether each of a node's relevant logs contain data or not. If they do contain data, save the last line of the file. * Get the CloudWatch agent's status for each node @@ -511,7 +511,7 @@ def verify_log_group_retention_days(self, log_groups, cluster_has_been_deleted): ) def verify_agent_status(self, logs_state): - """Verify CloudWatch agent is running on the MasterServer (or not if not enabled).""" + """Verify CloudWatch agent is running on the head node (or not if not enabled).""" expected_status = "running" if self.enabled else "stopped" assert_that(logs_state).extracting("agent_status").contains_only(expected_status) diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index 46373a588c..974a433776 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -541,7 +541,7 @@ def get_compute_nodes(self): # noqa: D102 @retry(retry_on_result=lambda result: "offline" not in result, wait_fixed=seconds(5), stop_max_delay=minutes(5)) def wait_for_locked_node(self): # noqa: D102 - # discard the first node since that is the master server + # discard the first node since that is the head node return self._remote_command_executor.run_remote_command(r'pbsnodes | grep -e "\sstate = " | tail -n +2').stdout def get_node_cores(self): diff --git a/tests/integration-tests/tests/disable_hyperthreading/test_disable_hyperthreading.py b/tests/integration-tests/tests/disable_hyperthreading/test_disable_hyperthreading.py index 0b001fba0d..603fe401d2 100644 --- a/tests/integration-tests/tests/disable_hyperthreading/test_disable_hyperthreading.py +++ b/tests/integration-tests/tests/disable_hyperthreading/test_disable_hyperthreading.py @@ -90,7 +90,7 @@ def _test_disable_hyperthreading_settings( result = remote_command_executor.run_remote_command("lscpu") if partition: # If partition is supplied, assume this is HIT setting where ht settings are at the queue level - # In this case, ht is not disabled on master + # In this case, ht is not disabled on head node assert_that(result.stdout).matches(r"Thread\(s\) per core:\s+{0}".format(2)) _assert_active_cpus(result.stdout, slots_per_instance) else: diff --git a/tests/integration-tests/tests/efa/test_efa.py b/tests/integration-tests/tests/efa/test_efa.py index 5c12bb75c7..bece1e555b 100644 --- a/tests/integration-tests/tests/efa/test_efa.py +++ b/tests/integration-tests/tests/efa/test_efa.py @@ -115,7 +115,7 @@ def _test_efa_installation(scheduler_commands, remote_command_executor, efa_inst else: assert_that(result.stdout).does_not_contain("1d0f:efa0") - # Check EFA interface not present on master + # Check EFA interface not present on head node result = remote_command_executor.run_remote_command("lspci -n") assert_that(result.stdout).does_not_contain("1d0f:efa0") diff --git 
a/tests/integration-tests/tests/intel_hpc/test_intel_hpc.py b/tests/integration-tests/tests/intel_hpc/test_intel_hpc.py index 3651c80c2f..44a7471253 100644 --- a/tests/integration-tests/tests/intel_hpc/test_intel_hpc.py +++ b/tests/integration-tests/tests/intel_hpc/test_intel_hpc.py @@ -35,7 +35,7 @@ def test_intel_hpc(region, scheduler, instance, os, pcluster_config_reader, clus def _test_intel_clck(remote_command_executor, scheduler_commands, test_datadir, os): - # Install Intel Cluster Checker CLCK Master + # Install Intel Cluster Checker CLCK on head node logging.info("Installing Intel Cluster Checker") remote_command_executor.run_remote_script(str(test_datadir / "install_clck.sh"), hide=False) diff --git a/tests/integration-tests/tests/runtime_bake/test_runtime_bake.py b/tests/integration-tests/tests/runtime_bake/test_runtime_bake.py index de0668ae14..9daa739a4a 100644 --- a/tests/integration-tests/tests/runtime_bake/test_runtime_bake.py +++ b/tests/integration-tests/tests/runtime_bake/test_runtime_bake.py @@ -40,7 +40,7 @@ def test_runtime_bake(scheduler, os, region, pcluster_config_reader, clusters_fa remote_command_executor = RemoteCommandExecutor(cluster) # Verify no chef.io endpoint is called in cloud-init-output log to download chef installer or chef packages""" - # on master + # on head node remote_command_executor.run_remote_script(str(test_datadir / "verify_chef_download.sh")) # on compute scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) diff --git a/tests/integration-tests/tests/scaling/test_scaling/test_multiple_jobs_submission/cluster-check.sh b/tests/integration-tests/tests/scaling/test_scaling/test_multiple_jobs_submission/cluster-check.sh index 58c4b97108..4ace1ae7b2 100755 --- a/tests/integration-tests/tests/scaling/test_scaling/test_multiple_jobs_submission/cluster-check.sh +++ b/tests/integration-tests/tests/scaling/test_scaling/test_multiple_jobs_submission/cluster-check.sh @@ -19,7 +19,7 @@ # minutes, one of which (hopefully) requires scaling (note that # scaling is currently not tested on Torque, because it's too big of a # pain to determine how many slots per node are on a Torque compute -# node from the master node). +# node from the head node). 
# # Usage: diff --git a/tests/integration-tests/tests/schedulers/test_torque.py b/tests/integration-tests/tests/schedulers/test_torque.py index 12434bd5ba..677a9107fb 100644 --- a/tests/integration-tests/tests/schedulers/test_torque.py +++ b/tests/integration-tests/tests/schedulers/test_torque.py @@ -192,7 +192,7 @@ def _test_dynamic_cluster_limits(remote_command_executor, max_queue_size, max_sl # Make sure cluster is scaled to 0 when this test starts assert_that(torque_commands.compute_nodes_count()).is_equal_to(0) - # sleeping for 1 second to give time to sqswatcher to reconfigure the master with np = max_nodes * node_slots + # sleeping for 1 second to give time to sqswatcher to reconfigure the head node with np = max_nodes * node_slots # operation that is performed right after sqswatcher removes the compute nodes from the scheduler time.sleep(1) _assert_scheduler_configuration(remote_command_executor, torque_commands, max_slots, max_queue_size) diff --git a/tests/integration-tests/tests/storage/test_efs.py b/tests/integration-tests/tests/storage/test_efs.py index 2e458b6b81..b718b65099 100644 --- a/tests/integration-tests/tests/storage/test_efs.py +++ b/tests/integration-tests/tests/storage/test_efs.py @@ -35,7 +35,7 @@ @pytest.mark.usefixtures("region", "os", "instance") def test_efs_compute_az(region, scheduler, pcluster_config_reader, clusters_factory, vpc_stack): """ - Test when compute subnet is in a different AZ from master subnet. + Test when compute subnet is in a different AZ from head node subnet. A compute mount target should be created and the efs correctly mounted on compute. """ @@ -57,7 +57,7 @@ def test_efs_compute_az(region, scheduler, pcluster_config_reader, clusters_fact @pytest.mark.usefixtures("region", "os", "instance") def test_efs_same_az(region, scheduler, pcluster_config_reader, clusters_factory, vpc_stack): """ - Test when compute subnet is in the same AZ as master subnet. + Test when compute subnet is in the same AZ as head node subnet. No compute mount point needed and the efs correctly mounted on compute. 
""" diff --git a/tests/integration-tests/tests/storage/test_efs/test_efs_compute_az/pcluster.config.ini b/tests/integration-tests/tests/storage/test_efs/test_efs_compute_az/pcluster.config.ini index 2c916fbb2d..9fda919812 100644 --- a/tests/integration-tests/tests/storage/test_efs/test_efs_compute_az/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_efs/test_efs_compute_az/pcluster.config.ini @@ -23,7 +23,7 @@ efs_settings = efs [vpc parallelcluster-vpc] vpc_id = {{ vpc_id }} master_subnet_id = {{ public_subnet_id }} -# This compute subnet would be in a different AZ than master for regions defined in AVAILABILITY_ZONE_OVERRIDES +# This compute subnet would be in a different AZ than head node for regions defined in AVAILABILITY_ZONE_OVERRIDES # See conftest for details compute_subnet_id = {{ private_additional_cidr_subnet_id }} diff --git a/util/cfn-stacks-generators/generate-efs-substack.py b/util/cfn-stacks-generators/generate-efs-substack.py index 2aad838606..3e2548b511 100644 --- a/util/cfn-stacks-generators/generate-efs-substack.py +++ b/util/cfn-stacks-generators/generate-efs-substack.py @@ -19,7 +19,7 @@ def main(args): Parameter("ComputeSecurityGroup", Type="String", Description="Security Group for Mount Target") ) head_node_subnet_id = t.add_parameter( - Parameter("MasterSubnetId", Type="String", Description="Master subnet id for master mount target") + Parameter("MasterSubnetId", Type="String", Description="Head node subnet id for head node mount target") ) compute_subnet_id = t.add_parameter( Parameter( From 9ea0071493fc748e7b0de59a0bfe9c000bc0491e Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 30 Nov 2020 12:03:22 +0100 Subject: [PATCH 20/66] Rename master_* to head_node_* in secondary user facing messages Signed-off-by: Enrico Usai --- CHANGELOG.md | 3 ++- README.md | 4 ++-- cli/src/pcluster/cli.py | 8 ++++---- cli/src/pcluster/commands.py | 2 +- cli/src/pcluster/config/cfn_param_types.py | 2 +- cli/src/pcluster/config/update_policy.py | 2 +- cli/src/pcluster/config/validators.py | 6 +++--- cli/src/pcluster/configure/easyconfig.py | 10 +++++----- cli/src/pcluster/configure/networking.py | 4 ++-- cli/src/pcluster/dcv/connect.py | 4 ++-- cli/src/pcluster/utils.py | 2 +- cli/tests/pcluster/config/test_validators.py | 4 ++-- .../test_bad_config_file/output.txt | 4 ++-- .../test_filtered_subnets_by_az/output.txt | 4 ++-- .../output.txt | 4 ++-- .../output.txt | 4 ++-- .../output.txt | 4 ++-- .../output.txt | 4 ++-- .../test_region_env_overwrite_region_config/output.txt | 4 ++-- .../output.txt | 4 ++-- .../output.txt | 4 ++-- .../output.txt | 4 ++-- .../test_unexisting_instance_type/output.txt | 4 ++-- .../output.txt | 4 ++-- .../test_vpc_automation_no_vpc_in_region/output.txt | 4 ++-- .../output.txt | 4 ++-- .../output.txt | 4 ++-- .../test_with_region_arg_with_config_file/output.txt | 4 ++-- .../tests/configure/test_pcluster_configure.py | 6 +++--- .../test_disable_hyperthreading.py | 4 ++-- 30 files changed, 63 insertions(+), 62 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbbe6324c5..dd17a5f52a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ CHANGELOG - Pull Amazon Linux Docker images from ECR when building docker image for `awsbatch` scheduler. This only applies to images built for `x86` architecture. +- Use inclusive language in user facing messages and internal naming convention. **BUG FIXES** @@ -24,7 +25,7 @@ CHANGELOG - Add support for CentOS 8 in all Commercial regions. 
- Add support for P4d instance type as compute node. -- Add the possibilty to enable NVIDIA GPUDirect RDMA support on EFA by using the new `enable_efa_gdr` configuration +- Add the possibility to enable NVIDIA GPUDirect RDMA support on EFA by using the new `enable_efa_gdr` configuration parameter. - Enable support for NICE DCV in GovCloud regions. - Enable support for AWS Batch scheduler in GovCloud regions. diff --git a/README.md b/README.md index de2ef0c40d..30faa7c7cb 100644 --- a/README.md +++ b/README.md @@ -78,8 +78,8 @@ In the latter case, just select the configuration you prefer. ``` Automate Subnet creation? (y/n) [y]: y Allowed values for Network Configuration: -1. Master in a public subnet and compute fleet in a private subnet -2. Master and compute fleet in the same public subnet +1. Head node in a public subnet and compute fleet in a private subnet +2. Head node and compute fleet in the same public subnet ``` diff --git a/cli/src/pcluster/cli.py b/cli/src/pcluster/cli.py index 22e7084d42..c2e8a85eb8 100644 --- a/cli/src/pcluster/cli.py +++ b/cli/src/pcluster/cli.py @@ -272,9 +272,9 @@ def _get_parser(): # stop command subparser pstop = subparsers.add_parser( "stop", - help="Stops the compute fleet, leaving the master server running.", + help="Stops the compute fleet, leaving the head node running.", epilog="This command sets the Auto Scaling Group parameters to min/max/desired = 0/0/0 and " - "terminates the compute fleet. The master will remain running. To terminate " + "terminates the compute fleet. The head node will remain running. To terminate " "all EC2 resources and avoid EC2 charges, consider deleting the cluster.", ) pstop.add_argument("cluster_name", help="Stops the compute fleet of the cluster name provided here.") @@ -331,7 +331,7 @@ def _get_parser(): ) pssh = subparsers.add_parser( "ssh", - help="Connects to the master instance using SSH.", + help="Connects to the head node instance using SSH.", description="Run ssh command with the cluster username and IP address pre-populated. " "Arbitrary arguments are appended to the end of the ssh command. " "This command can be customized in the aliases " @@ -429,7 +429,7 @@ def _get_parser(): dcv_subparsers.required = True dcv_subparsers.dest = "subcommand" pdcv_connect = dcv_subparsers.add_parser( - "connect", help="Permits to connect to the master node through an interactive session by using NICE DCV." + "connect", help="Permits to connect to the head node through an interactive session by using NICE DCV." ) _addarg_region(pdcv_connect) pdcv_connect.add_argument("cluster_name", help="Name of the cluster to connect to") diff --git a/cli/src/pcluster/commands.py b/cli/src/pcluster/commands.py index 5e5fe1fd68..1501fb70e4 100644 --- a/cli/src/pcluster/commands.py +++ b/cli/src/pcluster/commands.py @@ -429,7 +429,7 @@ def _poll_head_node_state(stack_name): try: instances = utils.describe_cluster_instances(stack_name, node_type=utils.NodeType.head_node) if not instances: - LOGGER.error("Cannot retrieve master node status. Exiting...") + LOGGER.error("Cannot retrieve head node status. 
Exiting...") sys.exit(1) head_node_id = instances[0].get("InstanceId") state = instances[0].get("State").get("Name") diff --git a/cli/src/pcluster/config/cfn_param_types.py b/cli/src/pcluster/config/cfn_param_types.py index 84cd70efbb..3c7bf976c9 100644 --- a/cli/src/pcluster/config/cfn_param_types.py +++ b/cli/src/pcluster/config/cfn_param_types.py @@ -800,7 +800,7 @@ class BaseOSCfnParam(CfnParam): def get_instance_type_architecture(instance_type): """Compute cluster's 'Architecture' CFN parameter based on its head node instance type.""" if not instance_type: - error("Cannot infer architecture without master instance type") + error("Cannot infer architecture without head node instance type") head_node_supported_architectures = get_supported_architectures_for_instance_type(instance_type) if not head_node_supported_architectures: diff --git a/cli/src/pcluster/config/update_policy.py b/cli/src/pcluster/config/update_policy.py index 0d80773f96..3c0195f39a 100644 --- a/cli/src/pcluster/config/update_policy.py +++ b/cli/src/pcluster/config/update_policy.py @@ -213,7 +213,7 @@ def _check_generated_bucket(change, patch): # Update supported only with head node down UpdatePolicy.HEAD_NODE_STOP = UpdatePolicy( level=20, - fail_reason="To perform this update action, the master node must be in a stopped state", + fail_reason="To perform this update action, the head node must be in a stopped state", action_needed=UpdatePolicy.ACTIONS_NEEDED["pcluster_stop"], condition_checker=lambda change, patch: utils.get_head_node_state(patch.stack_name) == "stopped", ) diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index b0ac26d3b7..1e09fff2a7 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -387,7 +387,7 @@ def dcv_enabled_validator(param_key, param_value, pcluster_config): if re.search(r"(micro)|(nano)", head_node_instance_type): warnings.append( "The packages required for desktop virtualization in the selected instance type '{0}' " - "may cause instability of the master instance. If you want to use NICE DCV it is recommended " + "may cause instability of the head node instance. If you want to use NICE DCV it is recommended " "to use an instance type with at least 1.7 GB of memory.".format(head_node_instance_type) ) @@ -625,7 +625,7 @@ def ec2_ami_validator(param_key, param_value, pcluster_config): if cluster_section.get_param_value("architecture") != ami_architecture: errors.append( "AMI {0}'s architecture ({1}) is incompatible with the architecture supported by the instance type " - "chosen for the master server ({2}). Use either a different AMI or a different instance type.".format( + "chosen for the head node ({2}). 
Use either a different AMI or a different instance type.".format( param_value, ami_architecture, cluster_section.get_param_value("architecture") ) ) @@ -1075,7 +1075,7 @@ def intel_hpc_architecture_validator(param_key, param_value, pcluster_config): architecture = pcluster_config.get_section("cluster").get_param_value("architecture") if param_value and architecture not in allowed_architectures: errors.append( - "When using enable_intel_hpc_platform = {0} it is required to use master and compute instance " + "When using enable_intel_hpc_platform = {0} it is required to use head node and compute instance " "types and an AMI that support these architectures: {1}".format(param_value, allowed_architectures) ) diff --git a/cli/src/pcluster/configure/easyconfig.py b/cli/src/pcluster/configure/easyconfig.py index 57ac022f70..ac707c3120 100644 --- a/cli/src/pcluster/configure/easyconfig.py +++ b/cli/src/pcluster/configure/easyconfig.py @@ -266,13 +266,13 @@ def _filter_subnets_offering_instance_type(subnet_list, instance_type): def _ask_for_subnets(subnet_list, vpc_section, qualified_head_node_subnets, qualified_compute_subnets): head_node_subnet_id = _prompt_for_subnet( - vpc_section.get_param_value("master_subnet_id"), subnet_list, qualified_head_node_subnets, "Master Subnet ID" + vpc_section.get_param_value("master_subnet_id"), subnet_list, qualified_head_node_subnets, "head node Subnet ID" ) compute_subnet_id = _prompt_for_subnet( vpc_section.get_param_value("compute_subnet_id") or head_node_subnet_id, subnet_list, qualified_compute_subnets, - "Compute Subnet ID", + "compute Subnet ID", ) vpc_parameters = {"master_subnet_id": head_node_subnet_id} @@ -294,8 +294,8 @@ def _choose_network_configuration(cluster_config): # Automate subnet creation only allows subnets to reside in a single az. # But user can bypass it by using manual subnets creation during configure or modify the config file directly. print( - "Error: There is no single availability zone offering master and compute in current region.\n" - "To create your cluster, make sure you have a subnet for master node in {0}" + "Error: There is no single availability zone offering head node and compute in current region.\n" + "To create your cluster, make sure you have a subnet for head node in {0}" ", and a subnet for compute nodes in {1}. 
Then run pcluster configure again" "and avoid using Automate VPC/Subnet creation.".format(azs_for_head_node_type, azs_for_compute_type) ) @@ -368,7 +368,7 @@ def prompt_os(self): def prompt_instance_types(self): """Ask for head_node_instance_type and compute_instance_type (if necessary).""" self.head_node_instance_type = prompt( - "Master instance type", + "Head node instance type", lambda x: _is_instance_type_supported_for_head_node(x) and x in get_supported_instance_types(), default_value=self.cluster_section.get_param_value("master_instance_type"), ) diff --git a/cli/src/pcluster/configure/networking.py b/cli/src/pcluster/configure/networking.py index 291abcac5b..de21f770fe 100644 --- a/cli/src/pcluster/configure/networking.py +++ b/cli/src/pcluster/configure/networking.py @@ -92,7 +92,7 @@ class PublicNetworkConfig(BaseNetworkConfig): def __init__(self, availability_zones=None): super(PublicNetworkConfig, self).__init__( - config_type="Master and compute fleet in the same public subnet", + config_type="Head node and compute fleet in the same public subnet", template_name="public", stack_name_prefix="pub", availability_zones=availability_zones, @@ -119,7 +119,7 @@ class PublicPrivateNetworkConfig(BaseNetworkConfig): def __init__(self, availability_zones=None): super(PublicPrivateNetworkConfig, self).__init__( - config_type="Master in a public subnet and compute fleet in a private subnet", + config_type="Head node in a public subnet and compute fleet in a private subnet", template_name="public-private", stack_name_prefix="pubpriv", availability_zones=availability_zones, diff --git a/cli/src/pcluster/dcv/connect.py b/cli/src/pcluster/dcv/connect.py index 2bb4afacae..f1b1fc01e2 100644 --- a/cli/src/pcluster/dcv/connect.py +++ b/cli/src/pcluster/dcv/connect.py @@ -59,7 +59,7 @@ def dcv_connect(args): error( "Something went wrong during DCV connection.\n{0}" "Please check the logs in the /var/log/parallelcluster/ folder " - "of the master instance and submit an issue {1}\n".format(e, PCLUSTER_ISSUES_LINK) + "of the head node and submit an issue {1}\n".format(e, PCLUSTER_ISSUES_LINK) ) if args.show_url: @@ -93,7 +93,7 @@ def _retrieve_dcv_session_url(ssh_cmd, cluster_name, head_node_ip): error( "Something went wrong during DCV connection. 
Please manually execute the command:\n{0}\n"
                "If the problem persists, please check the logs in the /var/log/parallelcluster/ folder "
-                "of the master instance and submit an issue {1}".format(ssh_cmd, PCLUSTER_ISSUES_LINK)
+                "of the head node and submit an issue {1}".format(ssh_cmd, PCLUSTER_ISSUES_LINK)
             )

     except sub.CalledProcessError as e:
diff --git a/cli/src/pcluster/utils.py b/cli/src/pcluster/utils.py
index 5ada244ae8..62d7ba49cc 100644
--- a/cli/src/pcluster/utils.py
+++ b/cli/src/pcluster/utils.py
@@ -857,7 +857,7 @@ def _get_head_node_ip(stack_name):
     ip_address = head_node.get("PrivateIpAddress")
     state = head_node.get("State").get("Name")
     if state != "running" or ip_address is None:
-        error("MasterServer: {0}\nCannot get ip address.".format(state.upper()))
+        error("Head node: {0}\nCannot get ip address.".format(state.upper()))
     return ip_address
diff --git a/cli/tests/pcluster/config/test_validators.py b/cli/tests/pcluster/config/test_validators.py
index b457bda99e..e603216d50 100644
--- a/cli/tests/pcluster/config/test_validators.py
+++ b/cli/tests/pcluster/config/test_validators.py
@@ -147,12 +147,12 @@ def test_ec2_key_pair_validator(mocker, boto3_stubber):
         (
             "arm64",
             None,
-            "incompatible with the architecture supported by the instance type chosen for the master server",
+            "incompatible with the architecture supported by the instance type chosen for the head node",
         ),
         (
             "arm64",
             "Unable to get information for AMI",
-            "incompatible with the architecture supported by the instance type chosen for the master server",
+            "incompatible with the architecture supported by the instance type chosen for the head node",
         ),
     ],
 )
diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_bad_config_file/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_bad_config_file/output.txt
index 5d09937dda..d7bef6e2fc 100644
--- a/cli/tests/pcluster/configure/test_pcluster_configure/test_bad_config_file/output.txt
+++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_bad_config_file/output.txt
@@ -51,7 +51,7 @@ Allowed values for VPC ID:
   3  vpc-34567891  default                            3
   4  vpc-45678912  ParallelClusterVPC-20190626095403  1
 Allowed values for Network Configuration:
-1. Master in a public subnet and compute fleet in a private subnet
-2. Master and compute fleet in the same public subnet
+1. Head node in a public subnet and compute fleet in a private subnet
+2. 
Head node and compute fleet in the same public subnet Configuration file written to {{ CONFIG_FILE }} You can edit your configuration file or simply run 'pcluster create -c {{ CONFIG_FILE }} cluster-name' to create your cluster diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_filtered_subnets_by_az/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_filtered_subnets_by_az/output.txt index cc832b0c57..c65d2bd4ad 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_filtered_subnets_by_az/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_filtered_subnets_by_az/output.txt @@ -46,12 +46,12 @@ Allowed values for VPC ID: 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 Note: 2 subnet(s) is/are not listed, because the instance type is not in their availability zone(s) -Allowed values for Master Subnet ID: +Allowed values for head node Subnet ID: # id name size availability_zone --- --------------- ------ ------ ------------------- 1 subnet-45678912 4096 eu-west-1a Note: 2 subnet(s) is/are not listed, because the instance type is not in their availability zone(s) -Allowed values for Compute Subnet ID: +Allowed values for compute Subnet ID: # id name size availability_zone --- --------------- ------ ------ ------------------- 1 subnet-45678912 4096 eu-west-1a diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_no_awsbatch_no_errors/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_no_awsbatch_no_errors/output.txt index d07d619a28..5bc8c99931 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_no_awsbatch_no_errors/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_no_awsbatch_no_errors/output.txt @@ -45,12 +45,12 @@ Allowed values for VPC ID: 2 vpc-23456789 ParallelClusterVPC-20190624105051 0 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 -Allowed values for Master Subnet ID: +Allowed values for head node Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b 2 subnet-23456789 ParallelClusterPrivateSubnet 4096 eu-west-1b -Allowed values for Compute Subnet ID: +Allowed values for compute Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_yes_awsbatch_no_errors/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_yes_awsbatch_no_errors/output.txt index 372869e07e..c6ab2cbec7 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_yes_awsbatch_no_errors/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_yes_awsbatch_no_errors/output.txt @@ -38,12 +38,12 @@ Allowed values for VPC ID: 2 vpc-23456789 ParallelClusterVPC-20190624105051 0 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 -Allowed values for Master Subnet ID: +Allowed values for head node Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b 2 subnet-23456789 ParallelClusterPrivateSubnet 
4096 eu-west-1b -Allowed values for Compute Subnet ID: +Allowed values for compute Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_available_no_input_no_automation_no_errors_with_config_file/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_available_no_input_no_automation_no_errors_with_config_file/output.txt index aa697307ab..fdedfe9131 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_available_no_input_no_automation_no_errors_with_config_file/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_available_no_input_no_automation_no_errors_with_config_file/output.txt @@ -28,12 +28,12 @@ Allowed values for VPC ID: 2 vpc-bcdefghi ParallelClusterVPC-20190624105051 0 3 vpc-cdefghij default 3 4 vpc-abdbabcb ParallelClusterVPC-20190626095403 1 -Allowed values for Master Subnet ID: +Allowed values for head node Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-77777777 ParallelClusterPublicSubnet 256 cn-north-1a 2 subnet-66666666 ParallelClusterPrivateSubnet 4096 cn-north-1a -Allowed values for Compute Subnet ID: +Allowed values for compute Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-77777777 ParallelClusterPublicSubnet 256 cn-north-1a diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_input_no_automation_no_errors_with_config_file/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_input_no_automation_no_errors_with_config_file/output.txt index 0c5dfb007b..26e9c4795b 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_input_no_automation_no_errors_with_config_file/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_input_no_automation_no_errors_with_config_file/output.txt @@ -45,12 +45,12 @@ Allowed values for VPC ID: 2 vpc-23456789 ParallelClusterVPC-20190624105051 0 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 -Allowed values for Master Subnet ID: +Allowed values for head node Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b 2 subnet-23456789 ParallelClusterPrivateSubnet 4096 eu-west-1b -Allowed values for Compute Subnet ID: +Allowed values for compute Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_region_env_overwrite_region_config/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_region_env_overwrite_region_config/output.txt index 0c5dfb007b..26e9c4795b 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_region_env_overwrite_region_config/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_region_env_overwrite_region_config/output.txt @@ -45,12 +45,12 @@ Allowed values for VPC ID: 2 vpc-23456789 ParallelClusterVPC-20190624105051 0 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 -Allowed values for Master Subnet 
ID: +Allowed values for head node Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b 2 subnet-23456789 ParallelClusterPrivateSubnet 4096 eu-west-1b -Allowed values for Compute Subnet ID: +Allowed values for compute Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors/output.txt index bc9aff4e37..6598ea03cd 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors/output.txt @@ -46,7 +46,7 @@ Allowed values for VPC ID: 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 Allowed values for Network Configuration: -1. Master in a public subnet and compute fleet in a private subnet -2. Master and compute fleet in the same public subnet +1. Head node in a public subnet and compute fleet in a private subnet +2. Head node and compute fleet in the same public subnet Configuration file written to {{ CONFIG_FILE }} You can edit your configuration file or simply run 'pcluster create -c {{ CONFIG_FILE }} cluster-name' to create your cluster diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_empty_vpc/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_empty_vpc/output.txt index 17e549ead9..e21739c6aa 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_empty_vpc/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_empty_vpc/output.txt @@ -47,7 +47,7 @@ Allowed values for VPC ID: 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 There are no qualified subnets. Starting automatic creation of subnets... Allowed values for Network Configuration: -1. Master in a public subnet and compute fleet in a private subnet -2. Master and compute fleet in the same public subnet +1. Head node in a public subnet and compute fleet in a private subnet +2. Head node and compute fleet in the same public subnet Configuration file written to {{ CONFIG_FILE }} You can edit your configuration file or simply run 'pcluster create -c {{ CONFIG_FILE }} cluster-name' to create your cluster diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_with_config_file/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_with_config_file/output.txt index 36cef0e6cb..a5e9fafbdf 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_with_config_file/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_with_config_file/output.txt @@ -46,7 +46,7 @@ Allowed values for VPC ID: 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 Allowed values for Network Configuration: -1. 
Master in a public subnet and compute fleet in a private subnet -2. Master and compute fleet in the same public subnet +1. Head node in a public subnet and compute fleet in a private subnet +2. Head node and compute fleet in the same public subnet Configuration file written to {{ CONFIG_FILE }} You can edit your configuration file or simply run 'pcluster create -c {{ CONFIG_FILE }} cluster-name' to create your cluster diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_unexisting_instance_type/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_unexisting_instance_type/output.txt index 74396f7b89..24e5885c62 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_unexisting_instance_type/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_unexisting_instance_type/output.txt @@ -45,13 +45,13 @@ Allowed values for VPC ID: 2 vpc-23456789 ParallelClusterVPC-20190624105051 0 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 -Allowed values for Master Subnet ID: +Allowed values for head node Subnet ID: # id name size availability_zone --- --------------- ------ ------ ------------------- 1 subnet-34567891 4096 eu-west-1b 2 subnet-45678912 4096 eu-west-1a 3 subnet-56789123 4096 eu-west-1c -Allowed values for Compute Subnet ID: +Allowed values for compute Subnet ID: # id name size availability_zone --- --------------- ------ ------ ------------------- 1 subnet-34567891 4096 eu-west-1b diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_awsbatch_no_errors/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_awsbatch_no_errors/output.txt index 913654dce4..10ef27030c 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_awsbatch_no_errors/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_awsbatch_no_errors/output.txt @@ -39,8 +39,8 @@ Allowed values for Operating System: 5. ubuntu1604 6. ubuntu1804 Allowed values for Network Configuration: -1. Master in a public subnet and compute fleet in a private subnet -2. Master and compute fleet in the same public subnet +1. Head node in a public subnet and compute fleet in a private subnet +2. Head node and compute fleet in the same public subnet Beginning VPC creation. Please do not leave the terminal until the creation is finalized Configuration file written to {{ CONFIG_FILE }} You can edit your configuration file or simply run 'pcluster create -c {{ CONFIG_FILE }} cluster-name' to create your cluster diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_vpc_in_region/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_vpc_in_region/output.txt index 2c506db741..642de60f6b 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_vpc_in_region/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_vpc_in_region/output.txt @@ -40,8 +40,8 @@ Allowed values for Operating System: 6. ubuntu1804 There are no VPC for the given region. Starting automatic creation of VPC and subnets... Allowed values for Network Configuration: -1. Master in a public subnet and compute fleet in a private subnet -2. Master and compute fleet in the same public subnet +1. Head node in a public subnet and compute fleet in a private subnet +2. 
Head node and compute fleet in the same public subnet Beginning VPC creation. Please do not leave the terminal until the creation is finalized Configuration file written to {{ CONFIG_FILE }} You can edit your configuration file or simply run 'pcluster create -c {{ CONFIG_FILE }} cluster-name' to create your cluster diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_vpc_in_region_public/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_vpc_in_region_public/output.txt index 2c506db741..642de60f6b 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_vpc_in_region_public/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_vpc_in_region_public/output.txt @@ -40,8 +40,8 @@ Allowed values for Operating System: 6. ubuntu1804 There are no VPC for the given region. Starting automatic creation of VPC and subnets... Allowed values for Network Configuration: -1. Master in a public subnet and compute fleet in a private subnet -2. Master and compute fleet in the same public subnet +1. Head node in a public subnet and compute fleet in a private subnet +2. Head node and compute fleet in the same public subnet Beginning VPC creation. Please do not leave the terminal until the creation is finalized Configuration file written to {{ CONFIG_FILE }} You can edit your configuration file or simply run 'pcluster create -c {{ CONFIG_FILE }} cluster-name' to create your cluster diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_with_input_no_automation_no_errors_with_config_file/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_with_input_no_automation_no_errors_with_config_file/output.txt index 74396f7b89..24e5885c62 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_with_input_no_automation_no_errors_with_config_file/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_with_input_no_automation_no_errors_with_config_file/output.txt @@ -45,13 +45,13 @@ Allowed values for VPC ID: 2 vpc-23456789 ParallelClusterVPC-20190624105051 0 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 -Allowed values for Master Subnet ID: +Allowed values for head node Subnet ID: # id name size availability_zone --- --------------- ------ ------ ------------------- 1 subnet-34567891 4096 eu-west-1b 2 subnet-45678912 4096 eu-west-1a 3 subnet-56789123 4096 eu-west-1c -Allowed values for Compute Subnet ID: +Allowed values for compute Subnet ID: # id name size availability_zone --- --------------- ------ ------ ------------------- 1 subnet-34567891 4096 eu-west-1b diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_with_region_arg_with_config_file/output.txt b/cli/tests/pcluster/configure/test_pcluster_configure/test_with_region_arg_with_config_file/output.txt index 8e1ad7ce2c..8579f9e437 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_with_region_arg_with_config_file/output.txt +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_with_region_arg_with_config_file/output.txt @@ -28,12 +28,12 @@ Allowed values for VPC ID: 2 vpc-23456789 ParallelClusterVPC-20190624105051 0 3 vpc-34567891 default 3 4 vpc-45678912 ParallelClusterVPC-20190626095403 1 -Allowed values for Master Subnet ID: +Allowed values for head node Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 
subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b 2 subnet-23456789 ParallelClusterPrivateSubnet 4096 eu-west-1b -Allowed values for Compute Subnet ID: +Allowed values for compute Subnet ID: # id name size availability_zone --- --------------- ---------------------------- ------ ------------------- 1 subnet-12345678 ParallelClusterPublicSubnet 256 eu-west-1b diff --git a/tests/integration-tests/tests/configure/test_pcluster_configure.py b/tests/integration-tests/tests/configure/test_pcluster_configure.py index 6eef0a28eb..0fa7bd180c 100644 --- a/tests/integration-tests/tests/configure/test_pcluster_configure.py +++ b/tests/integration-tests/tests/configure/test_pcluster_configure.py @@ -220,17 +220,17 @@ def orchestrate_pcluster_configure_stages( {"prompt": r"Operating System \[alinux2\]: ", "response": os, "skip_for_batch": True}, {"prompt": fr"Minimum cluster size \({compute_units}\) \[0\]: ", "response": "1"}, {"prompt": fr"Maximum cluster size \({compute_units}\) \[10\]: ", "response": ""}, - {"prompt": r"Master instance type \[t2\.micro\]: ", "response": instance}, + {"prompt": r"Head node instance type \[t2\.micro\]: ", "response": instance}, {"prompt": r"Compute instance type \[t2\.micro\]: ", "response": instance, "skip_for_batch": True}, {"prompt": r"Automate VPC creation\? \(y/n\) \[n\]: ", "response": "n"}, {"prompt": r"VPC ID \[vpc-.+\]: ", "response": vpc_id}, {"prompt": r"Automate Subnet creation\? \(y/n\) \[y\]: ", "response": "n"}, { - "prompt": fr"{omitted_note}Master Subnet ID \[subnet-.+\]: ", + "prompt": fr"{omitted_note}head node Subnet ID \[subnet-.+\]: ", "response": headnode_subnet_id, }, { - "prompt": fr"{omitted_note}Compute Subnet ID \[{default_compute_subnet}\]: ", + "prompt": fr"{omitted_note}compute Subnet ID \[{default_compute_subnet}\]: ", "response": compute_subnet_id, }, ] diff --git a/tests/integration-tests/tests/disable_hyperthreading/test_disable_hyperthreading.py b/tests/integration-tests/tests/disable_hyperthreading/test_disable_hyperthreading.py index 603fe401d2..4f34f62640 100644 --- a/tests/integration-tests/tests/disable_hyperthreading/test_disable_hyperthreading.py +++ b/tests/integration-tests/tests/disable_hyperthreading/test_disable_hyperthreading.py @@ -85,8 +85,8 @@ def _test_disable_hyperthreading_settings( expected_cpus_per_instance = slots_per_instance // 2 if hyperthreading_disabled else slots_per_instance expected_threads_per_core = 1 if hyperthreading_disabled else 2 - # Test disable hyperthreading on Master - logging.info("Test Disable Hyperthreading on Master") + # Test disable hyperthreading on head node + logging.info("Test Disable Hyperthreading on head node") result = remote_command_executor.run_remote_command("lscpu") if partition: # If partition is supplied, assume this is HIT setting where ht settings are at the queue level From c882a5aa1ad7078f09748c0612a32717ca9d96f1 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 30 Nov 2020 12:07:56 +0100 Subject: [PATCH 21/66] Rename from master_* to head_node_* in CFN descriptions Signed-off-by: Enrico Usai --- cloudformation/aws-parallelcluster.cfn.json | 20 +++++++++---------- cloudformation/cw-dashboard-substack.cfn.yaml | 4 ++-- cloudformation/efs-substack.cfn.json | 2 +- .../master-server-substack.cfn.yaml | 8 ++++---- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 332e6f7f2b..61f9f5dd34 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ 
b/cloudformation/aws-parallelcluster.cfn.json @@ -7,7 +7,7 @@ "Type": "AWS::EC2::KeyPair::KeyName" }, "MasterInstanceType": { - "Description": "MasterServer EC2 instance type", + "Description": "Head node EC2 instance type", "Type": "String", "Default": "t2.micro", "ConstraintDescription": "Must be a valid EC2 instance type, with support for HVM." @@ -72,7 +72,7 @@ "AllowedPattern": "^(NONE|standard|io1|io2|gp2|st1|sc1)((,|, )(NONE|standard|io1|io2|gp2|st1|sc1)){4}$" }, "MasterSubnetId": { - "Description": "ID of the Subnet you want to provision the Master server into", + "Description": "ID of the Subnet you want to provision the head node into", "Type": "AWS::EC2::Subnet::Id" }, "AvailabilityZone": { @@ -265,7 +265,7 @@ "Default": "NONE,NONE,NONE,NONE,NONE" }, "MasterRootVolumeSize": { - "Description": "Size of MasterServer EBS root volume in GB", + "Description": "Size of head node EBS root volume in GB", "Type": "Number", "Default": "25", "MinValue": "25" @@ -337,7 +337,7 @@ "Default": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE" }, "Cores": { - "Description": "Comma seperated string of [master cores], [compute cores], [master instance type supports disabling hyperthreading via CPU options], [compute instance type supports disabling hyperthreading via CPU options].", + "Description": "Comma seperated string of [head node cores], [compute cores], [head node instance type supports disabling hyperthreading via CPU options], [compute instance type supports disabling hyperthreading via CPU options].", "Type": "CommaDelimitedList", "Default": "NONE,NONE,NONE,NONE" }, @@ -370,7 +370,7 @@ "Default": "true,14" }, "NetworkInterfacesCount": { - "Description": "Comma separated string of [master network interfaces], [compute network interfaces].", + "Description": "Comma separated string of [head node network interfaces], [compute network interfaces].", "Type": "CommaDelimitedList", "Default": "1,1" } @@ -1935,7 +1935,7 @@ "MasterSecurityGroup": { "Type": "AWS::EC2::SecurityGroup", "Properties": { - "GroupDescription": "Enable access to the Master host", + "GroupDescription": "Enable access to the head node", "VpcId": { "Ref": "VPCId" }, @@ -2084,7 +2084,7 @@ "MasterENI": { "Type": "AWS::EC2::NetworkInterface", "Properties": { - "Description": "AWS ParallelCluster Master Server", + "Description": "AWS ParallelCluster head node interface", "SubnetId": { "Ref": "MasterSubnetId" }, @@ -4093,7 +4093,7 @@ }, "Outputs": { "ClusterUser": { - "Description": "Username to login to Master host", + "Description": "Username to login to head node", "Value": { "Fn::FindInMap": [ "OSFeatures", @@ -4105,7 +4105,7 @@ } }, "MasterPrivateIP": { - "Description": "Private IP Address of the Master host", + "Description": "Private IP Address of the head node", "Value": { "Fn::GetAtt": [ "MasterServerSubstack", @@ -4114,7 +4114,7 @@ } }, "MasterPublicIP": { - "Description": "Public IP Address of the Master host", + "Description": "Public IP Address of the head node", "Value": { "Fn::GetAtt": [ "MasterServerSubstack", diff --git a/cloudformation/cw-dashboard-substack.cfn.yaml b/cloudformation/cw-dashboard-substack.cfn.yaml index e600e5d075..04d7a9309f 100644 --- a/cloudformation/cw-dashboard-substack.cfn.yaml +++ b/cloudformation/cw-dashboard-substack.cfn.yaml @@ -53,10 +53,10 @@ Parameters: Type: String {#- Head Node parameters #} MasterInstanceId: - Description: ID of the Master instance + Description: ID of the head node instance Type: AWS::EC2::Instance::Id MasterPrivateIP: - Description: Private IP of the Master 
instance + Description: Private IP of the head node instance Type: String {#- EBS parameters #} EBSVolumesIds: diff --git a/cloudformation/efs-substack.cfn.json b/cloudformation/efs-substack.cfn.json index 57c8f0f695..8f00c106a5 100644 --- a/cloudformation/efs-substack.cfn.json +++ b/cloudformation/efs-substack.cfn.json @@ -247,7 +247,7 @@ "Type": "CommaDelimitedList" }, "MasterSubnetId": { - "Description": "Master subnet id for master mount target", + "Description": "Head node subnet id for head node mount target", "Type": "String" } }, diff --git a/cloudformation/master-server-substack.cfn.yaml b/cloudformation/master-server-substack.cfn.yaml index 3ef897b1b7..714e84782c 100644 --- a/cloudformation/master-server-substack.cfn.yaml +++ b/cloudformation/master-server-substack.cfn.yaml @@ -707,17 +707,17 @@ Resources: Condition: HasUpdateWaiterFunction Outputs: MasterInstanceID: - Description: ID of the Master instance + Description: ID of the head node instance Value: !Ref 'MasterServer' MasterPrivateIP: - Description: Private IP Address of the Master host + Description: Private IP Address of the head node Value: !GetAtt 'MasterServer.PrivateIp' MasterPublicIP: - Description: Public IP Address of the Master host + Description: Public IP Address of the head node Value: !GetAtt 'MasterServer.PublicIp' Condition: HasMasterPublicIp MasterPrivateDnsName: - Description: Private DNS name of the Master host + Description: Private DNS name of the head node Value: !GetAtt 'MasterServer.PrivateDnsName' Metadata: DependsOnCustomResources: !Ref 'DependsOnCustomResources' From c96d01bb42f5a38f55ef19eec621e69a0d190591 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 30 Nov 2020 12:17:59 +0100 Subject: [PATCH 22/66] Rename update recipe from update_master to update_head_node Signed-off-by: Enrico Usai --- cloudformation/master-server-substack.cfn.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudformation/master-server-substack.cfn.yaml b/cloudformation/master-server-substack.cfn.yaml index 714e84782c..eaecadd4c7 100644 --- a/cloudformation/master-server-substack.cfn.yaml +++ b/cloudformation/master-server-substack.cfn.yaml @@ -696,7 +696,7 @@ Resources: command: chef-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist - aws-parallelcluster::update_master + aws-parallelcluster::update_head_node cwd: /etc/chef UpdateWaiterCustomResource: Type: AWS::CloudFormation::CustomResource From 2dc01d08a97ffd0e3d4ae4aa20a03592115c9937 Mon Sep 17 00:00:00 2001 From: Hanwen <68928867+hanwen-pcluste@users.noreply.github.com> Date: Wed, 2 Dec 2020 10:09:47 -0500 Subject: [PATCH 23/66] Set default instance type to free tier instance type (#2254) When running pcluster in a region with free tier, default instance type is set to the free tier instance type. When running pcluster in the China (BJS) region or AWS GovCloud (US) regions, default instance type is t3.micro. Free tier is not available in the China (BJS) region and AWS GovCloud (US) regions. 
For more information about free tier, please see https://aws.amazon.com/free/free-tier-faqs/ Signed-off-by: Hanwen --- CHANGELOG.md | 3 + cli/src/pcluster/config/cfn_param_types.py | 29 +++++++++ cli/src/pcluster/config/mappings.py | 24 ++++---- cli/src/pcluster/config/param_types.py | 3 +- cli/src/pcluster/configure/easyconfig.py | 13 +++- cli/src/pcluster/createami.py | 9 --- cli/src/pcluster/examples/config | 4 +- cli/src/pcluster/utils.py | 20 +++++++ cli/tests/conftest.py | 14 +++++ cli/tests/pcluster/config/defaults.py | 6 +- .../pcluster/config/test_section_cluster.py | 12 +++- cli/tests/pcluster/config/test_utils.py | 59 +++++++++++++++++++ .../configure/test_pcluster_configure.py | 1 + .../test_bad_config_file/pcluster.config.ini | 3 +- .../pcluster.config.ini | 1 + .../pcluster.config.ini | 3 +- .../pcluster.config.ini | 2 + .../pcluster.config.ini | 3 +- .../pcluster.config.ini | 3 +- .../pcluster.config.ini | 3 +- .../pcluster.config.ini | 3 +- .../pcluster.config.ini | 3 +- .../pcluster.config.ini | 3 +- .../pcluster.config.ini | 1 + .../test_slurm_sit_full/expected_output.ini | 2 +- .../test_slurm_sit_simple/expected_output.ini | 2 +- .../expected_output.ini | 2 +- cloudformation/aws-parallelcluster.cfn.json | 2 - .../configs/common/common.yaml | 5 ++ .../configure/test_pcluster_configure.py | 45 ++++++++++++-- 30 files changed, 224 insertions(+), 59 deletions(-) create mode 100644 cli/tests/pcluster/config/test_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index dd17a5f52a..1c1e25cb86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ CHANGELOG - Pull Amazon Linux Docker images from ECR when building docker image for `awsbatch` scheduler. This only applies to images built for `x86` architecture. - Use inclusive language in user facing messages and internal naming convention. +- Change the default of instance types from the hardcoded `t2.micro` to the free tier instance type + (`t2.micro` or `t3.micro` dependent on region). In regions without free tier, the default is `t3.micro`. + **BUG FIXES** diff --git a/cli/src/pcluster/config/cfn_param_types.py b/cli/src/pcluster/config/cfn_param_types.py index 3c7bf976c9..dfdb7f3816 100644 --- a/cli/src/pcluster/config/cfn_param_types.py +++ b/cli/src/pcluster/config/cfn_param_types.py @@ -24,6 +24,7 @@ error, get_availability_zone_of_subnet, get_cfn_param, + get_default_instance_type, get_ebs_snapshot_info, get_efs_mount_target_id, get_file_section_name, @@ -854,6 +855,34 @@ def to_cfn(self): return cfn_params +class ComputeInstanceTypeCfnParam(CfnParam): + """ + Class to manage the compute instance type parameter. + + We need this class in order to set the default instance type from a boto3 call. + """ + + def refresh(self): + """Get default value from a boto3 call for free tier instance type.""" + if not self.value: + scheduler = self.pcluster_config.get_section("cluster").get_param_value("scheduler") + if scheduler: + self.value = "optimal" if scheduler == "awsbatch" else get_default_instance_type() + + +class HeadNodeInstanceTypeCfnParam(CfnParam): + """ + Class to manage the head node instance type parameter. + + We need this class in order to set the default instance type from a boto3 call. + """ + + def refresh(self): + """Get default value from a boto3 call for free tier instance type.""" + if not self.value: + self.value = get_default_instance_type() + + class TagsParam(JsonCfnParam): """ Class to manage the tags json configuration parameter. 
diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py index 2cf7221d79..768ec3255b 100644 --- a/cli/src/pcluster/config/mappings.py +++ b/cli/src/pcluster/config/mappings.py @@ -21,12 +21,14 @@ ClusterCfnSection, ClusterConfigMetadataCfnParam, ComputeAvailabilityZoneCfnParam, + ComputeInstanceTypeCfnParam, DisableHyperThreadingCfnParam, EBSSettingsCfnParam, EFSCfnSection, ExtraJsonCfnParam, FloatCfnParam, HeadNodeAvailabilityZoneCfnParam, + HeadNodeInstanceTypeCfnParam, IntCfnParam, MaintainInitialSizeCfnParam, NetworkInterfacesCountCfnParam, @@ -753,14 +755,6 @@ "validators": [ec2_key_pair_validator], "update_policy": UpdatePolicy.UNSUPPORTED }), - ("base_os", { - "type": BaseOSCfnParam, - "cfn_param_mapping": "BaseOS", - "allowed_values": ["alinux", "alinux2", "ubuntu1604", "ubuntu1804", "centos7", "centos8"], - "validators": [base_os_validator, architecture_os_validator], - "required": True, - "update_policy": UpdatePolicy.UNSUPPORTED - }), ("scheduler", { "cfn_param_mapping": "Scheduler", "allowed_values": ["awsbatch", "sge", "slurm", "torque"], @@ -770,7 +764,7 @@ }), # Head node ("master_instance_type", { - "default": "t2.micro", + "type": HeadNodeInstanceTypeCfnParam, "cfn_param_mapping": "MasterInstanceType", "validators": [head_node_instance_type_validator, ec2_instance_type_validator], "update_policy": UpdatePolicy.UNSUPPORTED, @@ -786,6 +780,14 @@ action_needed=UpdatePolicy.ACTIONS_NEEDED["ebs_volume_update"] ) }), + ("base_os", { + "type": BaseOSCfnParam, + "cfn_param_mapping": "BaseOS", + "allowed_values": ["alinux", "alinux2", "ubuntu1604", "ubuntu1804", "centos7", "centos8"], + "validators": [base_os_validator, architecture_os_validator], + "required": True, + "update_policy": UpdatePolicy.UNSUPPORTED + }), # Compute fleet ("compute_root_volume_size", { "type": IntCfnParam, @@ -1024,9 +1026,7 @@ }), # Compute fleet ("compute_instance_type", { - "default": - lambda section: - "optimal" if section and section.get_param_value("scheduler") == "awsbatch" else "t2.micro", + "type": ComputeInstanceTypeCfnParam, "cfn_param_mapping": "ComputeInstanceType", "validators": [compute_instance_type_validator, instances_architecture_compatibility_validator], "update_policy": UpdatePolicy.COMPUTE_FLEET_STOP diff --git a/cli/src/pcluster/config/param_types.py b/cli/src/pcluster/config/param_types.py index ce1e425f34..f6db94ad2b 100644 --- a/cli/src/pcluster/config/param_types.py +++ b/cli/src/pcluster/config/param_types.py @@ -13,6 +13,7 @@ import re import sys from abc import abstractmethod +from collections import OrderedDict from enum import Enum from configparser import NoSectionError @@ -441,7 +442,7 @@ def __init__(self, section_definition, pcluster_config, section_label=None, pare self.parent_section = parent_section # initialize section parameters with default values - self.params = {} + self.params = OrderedDict({}) self._from_definition() @property diff --git a/cli/src/pcluster/configure/easyconfig.py b/cli/src/pcluster/configure/easyconfig.py index ac707c3120..18720ecd6f 100644 --- a/cli/src/pcluster/configure/easyconfig.py +++ b/cli/src/pcluster/configure/easyconfig.py @@ -34,6 +34,7 @@ from pcluster.configure.utils import get_regions, get_resource_tag, handle_client_exception, prompt, prompt_iterable from pcluster.utils import ( error, + get_default_instance_type, get_region, get_supported_az_for_multi_instance_types, get_supported_az_for_one_instance_type, @@ -122,7 +123,7 @@ def configure(args): if pcluster_config.cluster_model == 
ClusterModel.HIT: error( "Configuration in file {0} cannot be overwritten. Please specify a different file path".format( - args.config_file + pcluster_config.config_file ) ) @@ -367,16 +368,22 @@ def prompt_os(self): def prompt_instance_types(self): """Ask for head_node_instance_type and compute_instance_type (if necessary).""" + default_head_node_instance_type = self.cluster_section.get_param_value("master_instance_type") + if not default_head_node_instance_type: + default_head_node_instance_type = get_default_instance_type() self.head_node_instance_type = prompt( "Head node instance type", lambda x: _is_instance_type_supported_for_head_node(x) and x in get_supported_instance_types(), - default_value=self.cluster_section.get_param_value("master_instance_type"), + default_value=default_head_node_instance_type, ) if not self.is_aws_batch: + default_compute_instance_type = self.cluster_section.get_param_value("compute_instance_type") + if not default_compute_instance_type: + default_compute_instance_type = get_default_instance_type() self.compute_instance_type = prompt( "Compute instance type", lambda x: x in get_supported_compute_instance_types(self.scheduler), - default_value=self.cluster_section.get_param_value("compute_instance_type"), + default_value=default_compute_instance_type, ) # Cache availability zones offering the selected instance type(s) for later use self.cache_qualified_az() diff --git a/cli/src/pcluster/createami.py b/cli/src/pcluster/createami.py index 3d034fd7c6..b0e3e8cf18 100644 --- a/cli/src/pcluster/createami.py +++ b/cli/src/pcluster/createami.py @@ -358,12 +358,3 @@ def create_ami(args): _print_create_ami_results(results) if "tmp_dir" in locals() and tmp_dir: rmtree(tmp_dir) - - -def _get_default_template_url(region): - return ( - "https://{REGION}-aws-parallelcluster.s3.{REGION}.amazonaws.com{SUFFIX}/templates/" - "aws-parallelcluster-{VERSION}.cfn.json".format( - REGION=region, SUFFIX=".cn" if region.startswith("cn") else "", VERSION=utils.get_installed_version() - ) - ) diff --git a/cli/src/pcluster/examples/config b/cli/src/pcluster/examples/config index 4179a54f56..f3494474ba 100644 --- a/cli/src/pcluster/examples/config +++ b/cli/src/pcluster/examples/config @@ -24,10 +24,10 @@ key_name = mykey # (defaults to https://s3.amazonaws.com/-aws-parallelcluster/templates/aws-parallelcluster-.cfn.json) #template_url = https://s3.amazonaws.com/-aws-parallelcluster/templates/aws-parallelcluster-.cfn.json # EC2 instance type for head node -# (defaults to t2.micro) +# (defaults to the free tier instance type of the region. If the region does not have free tier, default to t3.micro) #master_instance_type = t2.micro # EC2 instance type for compute nodes -# (defaults to t2.micro , 'optimal' when scheduler is awsbatch) +# (defaults to the free tier instance type of the region, 'optimal' when scheduler is awsbatch) #compute_instance_type = t2.micro # Initial number of EC2 instances to launch as compute nodes in the cluster for schedulers other than awsbatch. # (defaults to 2) diff --git a/cli/src/pcluster/utils.py b/cli/src/pcluster/utils.py index 62d7ba49cc..00844e2fb2 100644 --- a/cli/src/pcluster/utils.py +++ b/cli/src/pcluster/utils.py @@ -1170,6 +1170,26 @@ def get_ebs_snapshot_info(ebs_snapshot_id, raise_exceptions=False): ) +def get_default_instance_type(): + """If current region support free tier, return the free tier instance type. 
Otherwise, return t3.micro .""" + if not hasattr(get_default_instance_type, "cache"): + get_default_instance_type.cache = {} + cache = get_default_instance_type.cache + region = os.environ.get("AWS_DEFAULT_REGION") + if region not in cache: + free_tier_instance_type = [] + for page in paginate_boto3( + boto3.client("ec2").describe_instance_types, + Filters=[ + {"Name": "free-tier-eligible", "Values": ["true"]}, + {"Name": "current-generation", "Values": ["true"]}, + ], + ): + free_tier_instance_type.append(page) + cache[region] = free_tier_instance_type[0]["InstanceType"] if free_tier_instance_type else "t3.micro" + return cache[region] + + class Cache: """Simple utility class providing a cache mechanism for expensive functions.""" diff --git a/cli/tests/conftest.py b/cli/tests/conftest.py index e1c4fbc8f3..0cad54ed55 100644 --- a/cli/tests/conftest.py +++ b/cli/tests/conftest.py @@ -19,6 +19,20 @@ def clear_env(): del os.environ["AWS_DEFAULT_REGION"] +@pytest.fixture(autouse=True) +def mock_default_instance(mocker, request): + """ + Mock get_default_instance_type for all tests. + + To disable the mock for certain tests, add annotation `@pytest.mark.nomockdefaultinstance` to the tests. + To disable the mock for an entire file, declare global var `pytestmark = pytest.mark.noassertnopendingresponses` + """ + if "nomockdefaultinstance" in request.keywords: + # skip mocking + return + mocker.patch("pcluster.config.cfn_param_types.get_default_instance_type", return_value="t2.micro") + + @pytest.fixture def failed_with_message(capsys): """Assert that the command exited with a specific error message.""" diff --git a/cli/tests/pcluster/config/defaults.py b/cli/tests/pcluster/config/defaults.py index 33e8bab722..73d0e4cae4 100644 --- a/cli/tests/pcluster/config/defaults.py +++ b/cli/tests/pcluster/config/defaults.py @@ -99,9 +99,9 @@ "shared_dir": "/shared", "placement_group": None, "placement": "compute", - "master_instance_type": "t2.micro", + "master_instance_type": None, "master_root_volume_size": 25, - "compute_instance_type": "t2.micro", + "compute_instance_type": None, "compute_root_volume_size": 25, "initial_queue_size": 0, "max_queue_size": 10, @@ -155,7 +155,7 @@ "base_os": None, # base_os does not have a default, but this is here to make testing easier "scheduler": None, # The cluster does not have a default, but this is here to make testing easier "shared_dir": "/shared", - "master_instance_type": "t2.micro", + "master_instance_type": None, "master_root_volume_size": 25, "compute_root_volume_size": 25, "proxy_server": None, diff --git a/cli/tests/pcluster/config/test_section_cluster.py b/cli/tests/pcluster/config/test_section_cluster.py index c94840063c..b6f293767c 100644 --- a/cli/tests/pcluster/config/test_section_cluster.py +++ b/cli/tests/pcluster/config/test_section_cluster.py @@ -38,6 +38,8 @@ "base_os": "alinux2", "scheduler": "slurm", "cluster_config_metadata": {"sections": {"cluster": ["custom_cluster_label"]}}, + "master_instance_type": "t2.micro", + "compute_instance_type": "t2.micro", }, ), "custom_cluster_label", @@ -50,6 +52,8 @@ "additional_iam_policies": ["arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"], "base_os": "alinux2", "scheduler": "slurm", + "master_instance_type": "t2.micro", + "compute_instance_type": "t2.micro", }, ), "default", @@ -86,6 +90,8 @@ "arn:aws:iam::aws:policy/AWSBatchFullAccess", "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy", ], + "master_instance_type": "t2.micro", + "compute_instance_type": "t2.micro", }, ), "default", @@ -267,7 
+273,6 @@ def test_hit_cluster_section_from_file(mocker, config_parser_dict, expected_dict ("placement", "cluster", "cluster", None), # Head node # TODO add regex for master_instance_type - ("master_instance_type", None, "t2.micro", None), ("master_instance_type", "", "", None), ("master_instance_type", "test", "test", None), ("master_instance_type", "NONE", "NONE", None), @@ -281,7 +286,6 @@ def test_hit_cluster_section_from_file(mocker, config_parser_dict, expected_dict ("master_root_volume_size", "31", 31, None), # Compute fleet # TODO add regex for compute_instance_type - ("compute_instance_type", None, "t2.micro", None), ("compute_instance_type", "", "", None), ("compute_instance_type", "test", "test", None), ("compute_instance_type", "NONE", "NONE", None), @@ -543,7 +547,6 @@ def test_sit_cluster_param_from_file( ("shared_dir", "NONE", "NONE", None), # NONE is evaluated as a valid path # Head node # TODO add regex for master_instance_type - ("master_instance_type", None, "t2.micro", None), ("master_instance_type", "", "", None), ("master_instance_type", "test", "test", None), ("master_instance_type", "NONE", "NONE", None), @@ -801,6 +804,9 @@ def test_sit_cluster_section_to_file(mocker, section_dict, expected_config_parse def test_cluster_section_to_cfn( mocker, cluster_section_definition, section_dict, expected_cfn_params, default_threads_per_core ): + section_dict["master_instance_type"] = "t2.micro" + if cluster_section_definition == CLUSTER_SIT: + section_dict["compute_instance_type"] = "t2.micro" utils.set_default_values_for_required_cluster_section_params(section_dict) utils.mock_pcluster_config(mocker) mocker.patch("pcluster.config.cfn_param_types.get_efs_mount_target_id", return_value="valid_mount_target_id") diff --git a/cli/tests/pcluster/config/test_utils.py b/cli/tests/pcluster/config/test_utils.py new file mode 100644 index 0000000000..88ae99f255 --- /dev/null +++ b/cli/tests/pcluster/config/test_utils.py @@ -0,0 +1,59 @@ +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import pytest +from assertpy import assert_that + +from pcluster.utils import get_default_instance_type +from tests.common import MockedBoto3Request + + +@pytest.fixture() +def boto3_stubber_path(): + return "pcluster.utils.boto3" + + +@pytest.mark.parametrize( + "region, free_tier_instance_type, default_instance_type, stub_boto3", + [ + ("us-east-1", "t2.micro", "t2.micro", True), + ("eu-north-1", "t3.micro", "t3.micro", True), + ("us-gov-east-1", None, "t3.micro", True), + # Retrieving free tier instance type again should use cache to reduce boto3 call + ("us-east-1", "t2.micro", "t2.micro", False), + ("eu-north-1", "t3.micro", "t3.micro", False), + ("us-gov-east-1", None, "t3.micro", False), + ], +) +@pytest.mark.nomockdefaultinstance +def test_get_default_instance(boto3_stubber, region, free_tier_instance_type, default_instance_type, stub_boto3): + os.environ["AWS_DEFAULT_REGION"] = region + if free_tier_instance_type: + response = {"InstanceTypes": [{"InstanceType": free_tier_instance_type}]} + else: + response = {"InstanceTypes": []} + if stub_boto3: + mocked_requests = [ + MockedBoto3Request( + method="describe_instance_types", + response=response, + expected_params={ + "Filters": [ + {"Name": "free-tier-eligible", "Values": ["true"]}, + {"Name": "current-generation", "Values": ["true"]}, + ] + }, + ) + ] + + boto3_stubber("ec2", mocked_requests) + assert_that(get_default_instance_type()).is_equal_to(default_instance_type) diff --git a/cli/tests/pcluster/configure/test_pcluster_configure.py b/cli/tests/pcluster/configure/test_pcluster_configure.py index 5da9455901..cb4b43b12c 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure.py +++ b/cli/tests/pcluster/configure/test_pcluster_configure.py @@ -382,6 +382,7 @@ def __init__(self, mocker, empty_region=False, partition="commercial", mock_avai _mock_list_vpcs_and_subnets(self.mocker, empty_region, partition) _mock_parallel_cluster_config(self.mocker) _mock_cache_availability_zones(self.mocker) + mocker.patch("pcluster.configure.easyconfig.get_default_instance_type", return_value="t2.micro") if mock_availability_zone: _mock_availability_zone(self.mocker) diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_bad_config_file/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_bad_config_file/pcluster.config.ini index 584ef221cd..4bbcff5504 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_bad_config_file/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_bad_config_file/pcluster.config.ini @@ -14,8 +14,7 @@ vpc_settings = default # Implied value scheduler = sge base_os = centos7 -# Implied value -# compute_instance_type = t2.micro +compute_instance_type = t2.micro master_instance_type = t2.nano #Invalid value type max_queue_size = 14 diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_no_awsbatch_no_errors/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_no_awsbatch_no_errors/pcluster.config.ini index e2fe3749f1..1f4800a4ca 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_no_awsbatch_no_errors/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_no_awsbatch_no_errors/pcluster.config.ini @@ -6,6 +6,7 @@ key_name = key1 vpc_settings = default scheduler = torque master_instance_type = t2.nano +compute_instance_type = t2.micro max_queue_size = 14 
initial_queue_size = 13 maintain_initial_size = true diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_yes_awsbatch_no_errors/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_yes_awsbatch_no_errors/pcluster.config.ini index 95356e94e9..8c696b839e 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_yes_awsbatch_no_errors/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_no_automation_yes_awsbatch_no_errors/pcluster.config.ini @@ -7,8 +7,7 @@ key_name = key1 base_os = alinux2 vpc_settings = default scheduler = awsbatch -# Implied value -# compute_instance_type = optimal +compute_instance_type = optimal master_instance_type = t2.nano max_vcpus = 14 min_vcpus = 13 diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_region_env_overwrite_region_config/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_region_env_overwrite_region_config/pcluster.config.ini index 639f3e57a5..48e4bf5524 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_region_env_overwrite_region_config/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_region_env_overwrite_region_config/pcluster.config.ini @@ -6,6 +6,8 @@ key_name = key3 vpc_settings = default scheduler = torque base_os = alinux2 +master_instance_type = t2.micro +compute_instance_type = t2.micro [vpc default] vpc_id = vpc-12345678 diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors/pcluster.config.ini index d00170d6f6..f663bc15fc 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors/pcluster.config.ini @@ -7,8 +7,7 @@ vpc_settings = default # Implied value scheduler = sge base_os = centos7 -# Implied value -# compute_instance_type = t2.micro +compute_instance_type = t2.micro master_instance_type = t2.nano max_queue_size = 14 initial_queue_size = 13 diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_empty_vpc/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_empty_vpc/pcluster.config.ini index fc7f4ed2b3..bd2b285434 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_empty_vpc/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_empty_vpc/pcluster.config.ini @@ -6,8 +6,7 @@ key_name = key1 vpc_settings = default scheduler = sge base_os = centos7 -# Implied value -# compute_instance_type = t2.micro +compute_instance_type = t2.micro master_instance_type = t2.nano max_queue_size = 14 initial_queue_size = 13 diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_with_config_file/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_with_config_file/pcluster.config.ini index fff2041ee2..a401a06e00 100644 --- 
a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_with_config_file/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_no_awsbatch_no_errors_with_config_file/pcluster.config.ini @@ -7,8 +7,7 @@ vpc_settings = default # Implied value scheduler = sge base_os = centos7 -# Implied value -# compute_instance_type = t2.micro +compute_instance_type = t2.micro master_instance_type = t2.nano max_queue_size = 14 initial_queue_size = 13 diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_yes_awsbatch_invalid_vpc/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_yes_awsbatch_invalid_vpc/pcluster.config.ini index 59256eeabd..170494afc4 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_yes_awsbatch_invalid_vpc/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_subnet_automation_yes_awsbatch_invalid_vpc/pcluster.config.ini @@ -5,9 +5,8 @@ aws_region_name = eu-west-1 key_name = key1 vpc_settings = default scheduler = awsbatch -# Implied value base_os = alinux2 -# compute_instance_type = optimal +compute_instance_type = optimal master_instance_type = t2.nano max_vcpus = 14 min_vcpus = 13 diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_awsbatch_no_errors/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_awsbatch_no_errors/pcluster.config.ini index d00170d6f6..f663bc15fc 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_awsbatch_no_errors/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_no_awsbatch_no_errors/pcluster.config.ini @@ -7,8 +7,7 @@ vpc_settings = default # Implied value scheduler = sge base_os = centos7 -# Implied value -# compute_instance_type = t2.micro +compute_instance_type = t2.micro master_instance_type = t2.nano max_queue_size = 14 initial_queue_size = 13 diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_yes_awsbatch_no_errors/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_yes_awsbatch_no_errors/pcluster.config.ini index 59256eeabd..170494afc4 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_yes_awsbatch_no_errors/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_vpc_automation_yes_awsbatch_no_errors/pcluster.config.ini @@ -5,9 +5,8 @@ aws_region_name = eu-west-1 key_name = key1 vpc_settings = default scheduler = awsbatch -# Implied value base_os = alinux2 -# compute_instance_type = optimal +compute_instance_type = optimal master_instance_type = t2.nano max_vcpus = 14 min_vcpus = 13 diff --git a/cli/tests/pcluster/configure/test_pcluster_configure/test_with_region_arg_with_config_file/pcluster.config.ini b/cli/tests/pcluster/configure/test_pcluster_configure/test_with_region_arg_with_config_file/pcluster.config.ini index d80f4a8a9f..9b38fa7ec4 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure/test_with_region_arg_with_config_file/pcluster.config.ini +++ b/cli/tests/pcluster/configure/test_pcluster_configure/test_with_region_arg_with_config_file/pcluster.config.ini @@ -14,6 +14,7 @@ key_name = key1 base_os = alinux scheduler = torque master_instance_type = t2.nano +compute_instance_type = t2.micro 
vpc_settings = default initial_queue_size = 13 max_queue_size = 14 diff --git a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/expected_output.ini b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/expected_output.ini index 78222099bb..4aa0eafb7c 100644 --- a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/expected_output.ini +++ b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/expected_output.ini @@ -49,10 +49,10 @@ vpc_security_group_id = sg-0fa8d8e11dc6e9491 [cluster slurm-sit-full] key_name = test-key -base_os = centos7 scheduler = slurm master_instance_type = t2.large master_root_volume_size = 30 +base_os = centos7 compute_root_volume_size = 30 proxy_server = proxy ec2_iam_role = role diff --git a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_simple/expected_output.ini b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_simple/expected_output.ini index e27b63153c..909995b851 100644 --- a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_simple/expected_output.ini +++ b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_simple/expected_output.ini @@ -15,9 +15,9 @@ sanity_check = true [cluster slurm-sit-simple] key_name = test -base_os = centos7 scheduler = slurm master_instance_type = c5.2xlarge +base_os = centos7 shared_dir = /test vpc_settings = public additional_iam_policies = arn:aws:iam::aws:policy/CloudWatchFullAccess diff --git a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/expected_output.ini b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/expected_output.ini index 43d20fa990..1baaa68787 100644 --- a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/expected_output.ini +++ b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/expected_output.ini @@ -49,10 +49,10 @@ vpc_security_group_id = sg-0fa8d8e11dc6e9491 [cluster slurm-sit-full] key_name = test-key -base_os = centos7 scheduler = slurm master_instance_type = t2.large master_root_volume_size = 30 +base_os = centos7 compute_root_volume_size = 30 proxy_server = proxy ec2_iam_role = role diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 61f9f5dd34..e71999af29 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -9,13 +9,11 @@ "MasterInstanceType": { "Description": "Head node EC2 instance type", "Type": "String", - "Default": "t2.micro", "ConstraintDescription": "Must be a valid EC2 instance type, with support for HVM." }, "ComputeInstanceType": { "Description": "ComputeFleet EC2 instance type", "Type": "String", - "Default": "t2.micro", "ConstraintDescription": "Must be a valid EC2 instance type, with support for HVM." 
}, "MinSize": { diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index f51be906e7..d30339c18f 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -69,6 +69,11 @@ configure: instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["slurm"] + test_pcluster_configure.py::test_region_without_t2micro: + dimensions: + - regions: ["eu-north-1"] # must be regions that do not have t2.micro + oss: ["centos7"] + schedulers: ["slurm"] create: test_create.py::test_create_wrong_os: dimensions: diff --git a/tests/integration-tests/tests/configure/test_pcluster_configure.py b/tests/integration-tests/tests/configure/test_pcluster_configure.py index 0fa7bd180c..bc00100247 100644 --- a/tests/integration-tests/tests/configure/test_pcluster_configure.py +++ b/tests/integration-tests/tests/configure/test_pcluster_configure.py @@ -41,7 +41,6 @@ def test_pcluster_configure( vpc_stack.cfn_outputs["VpcId"], vpc_stack.cfn_outputs["PublicSubnetId"], vpc_stack.cfn_outputs["PrivateSubnetId"], - vpc_stack, ) assert_configure_workflow(region, stages, config_path) assert_config_contains_expected_values( @@ -92,7 +91,6 @@ def test_pcluster_configure_avoid_bad_subnets( # and use the first subnet in the remaining list of subnets "", "", - vpc_stack, omitted_subnets_num=1, ) assert_configure_workflow(region, stages, config_path) @@ -109,6 +107,44 @@ def test_pcluster_configure_avoid_bad_subnets( ) +def test_region_without_t2micro( + vpc_stack, + pcluster_config_reader, + key_name, + region, + os, + scheduler, + test_datadir, +): + """ + Verify the default instance type (free tier) is retrieved dynamically according to region. + In other words, t3.micro is retrieved when the region does not contain t2.micro + """ + config_path = test_datadir / "config.ini" + stages = orchestrate_pcluster_configure_stages( + region, + key_name, + scheduler, + os, + "", + vpc_stack.cfn_outputs["VpcId"], + vpc_stack.cfn_outputs["PublicSubnetId"], + vpc_stack.cfn_outputs["PrivateSubnetId"], + ) + assert_configure_workflow(region, stages, config_path) + assert_config_contains_expected_values( + region, + key_name, + scheduler, + os, + "", + vpc_stack.cfn_outputs["VpcId"], + vpc_stack.cfn_outputs["PublicSubnetId"], + vpc_stack.cfn_outputs["PrivateSubnetId"], + config_path, + ) + + def skip_if_unsupported_test_options_were_used(request): unsupported_options = get_unsupported_test_runner_options(request) if unsupported_options: @@ -205,7 +241,6 @@ def orchestrate_pcluster_configure_stages( vpc_id, headnode_subnet_id, compute_subnet_id, - vpc_stack, omitted_subnets_num=0, ): compute_units = "vcpus" if scheduler == "awsbatch" else "instances" @@ -220,8 +255,8 @@ def orchestrate_pcluster_configure_stages( {"prompt": r"Operating System \[alinux2\]: ", "response": os, "skip_for_batch": True}, {"prompt": fr"Minimum cluster size \({compute_units}\) \[0\]: ", "response": "1"}, {"prompt": fr"Maximum cluster size \({compute_units}\) \[10\]: ", "response": ""}, - {"prompt": r"Head node instance type \[t2\.micro\]: ", "response": instance}, - {"prompt": r"Compute instance type \[t2\.micro\]: ", "response": instance, "skip_for_batch": True}, + {"prompt": r"Head node instance type \[t.\.micro\]: ", "response": instance}, + {"prompt": r"Compute instance type \[t.\.micro\]: ", "response": instance, "skip_for_batch": True}, {"prompt": r"Automate VPC creation\? 
\(y/n\) \[n\]: ", "response": "n"}, {"prompt": r"VPC ID \[vpc-.+\]: ", "response": vpc_id}, {"prompt": r"Automate Subnet creation\? \(y/n\) \[y\]: ", "response": "n"}, From ad7017177e3ad7a76d49ded0ea29aaa8352c4535 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Fri, 27 Nov 2020 09:28:22 +0100 Subject: [PATCH 24/66] Move p4d tests on PDX (us-west-2) Move half p4d tests on PDX (us-west-2) Signed-off-by: Luca Carrogu --- tests/integration-tests/configs/common/common.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index d30339c18f..af18d578c1 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -535,7 +535,7 @@ multiple_nics: instances: ["p4d.24xlarge"] oss: ["alinux2", "ubuntu1604", "centos8"] schedulers: ["slurm"] - - regions: ["us-east-1"] + - regions: ["us-west-2"] instances: ["p4d.24xlarge"] oss: ["alinux", "ubuntu1804", "centos7"] schedulers: ["slurm"] From 57f56b28eb7214403e44e48aa52e3d39df7f6d5d Mon Sep 17 00:00:00 2001 From: ddeidda <53186409+ddeidda@users.noreply.github.com> Date: Thu, 3 Dec 2020 16:11:08 +0100 Subject: [PATCH 25/66] Restore default values in cfn template for ComputeInstanceType (#2284) Compute instance type parameter is not rendered if scheduler is Slurm. This caused the error `Parameters: [ComputeInstanceType] must have values` in CloudFormation because a value was still expected. With this commit we set "NONE" as default to prevent this value being silently used as if set by the user. Signed-off-by: ddeidda --- cli/tests/pcluster/config/test_source_consistency.py | 4 ++++ cloudformation/aws-parallelcluster.cfn.json | 1 + 2 files changed, 5 insertions(+) diff --git a/cli/tests/pcluster/config/test_source_consistency.py b/cli/tests/pcluster/config/test_source_consistency.py index 7ea08d5aa3..b177fa22e8 100644 --- a/cli/tests/pcluster/config/test_source_consistency.py +++ b/cli/tests/pcluster/config/test_source_consistency.py @@ -88,6 +88,10 @@ def test_defaults_consistency(): # metadata is generated dynamically based on user's configuration. ignored_params += ["ClusterConfigMetadata"] + # ComputeInstanceType parameter is expected to differ from the default value in the CFN template because + # it is dynamically generated based on the AWS region + ignored_params += ["ComputeInstanceType"] + cfn_params = [section_cfn_params.value for section_cfn_params in DefaultCfnParams] default_cfn_values = utils.merge_dicts(*cfn_params) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index e71999af29..ad01d2604f 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -14,6 +14,7 @@ "ComputeInstanceType": { "Description": "ComputeFleet EC2 instance type", "Type": "String", + "Default": "NONE", "ConstraintDescription": "Must be a valid EC2 instance type, with support for HVM." }, "MinSize": { From 62bfc1af84f5288ebb2c59c08aea92c5ed345785 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Fri, 4 Dec 2020 11:26:36 +0100 Subject: [PATCH 26/66] Change second queue instance type from c4.xlarge to c5.large Reason for this change is that not all the regions support c4.xlarge. C5 family support is broader What does this change solve? 
It allows to run the test where C4 isn't present Signed-off-by: Luca Carrogu --- .../tests/scaling/test_scaling.py | 33 ++++++++++--------- .../test_hit_scaling/pcluster.config.ini | 4 +-- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/integration-tests/tests/scaling/test_scaling.py b/tests/integration-tests/tests/scaling/test_scaling.py index 8308a6dada..4e3b8038b8 100644 --- a/tests/integration-tests/tests/scaling/test_scaling.py +++ b/tests/integration-tests/tests/scaling/test_scaling.py @@ -126,16 +126,16 @@ def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_ @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["slurm"]) @pytest.mark.oss(["alinux2", "centos7", "centos8", "ubuntu1804"]) -@pytest.mark.usefixtures("region", "os", "instance") +@pytest.mark.usefixtures("region", "os") @pytest.mark.hit_scaling -def test_hit_scaling(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir): +def test_hit_scaling(scheduler, region, instance, pcluster_config_reader, clusters_factory, test_datadir): """Test that slurm-specific scaling logic is resistent to manual actions and failures.""" cluster_config = pcluster_config_reader(scaledown_idletime=3) cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) - _assert_cluster_initial_conditions(scheduler_commands) + _assert_cluster_initial_conditions(scheduler_commands, instance) _test_partition_states( scheduler_commands, cluster.cfn_name, @@ -144,7 +144,7 @@ def test_hit_scaling(scheduler, region, pcluster_config_reader, clusters_factory inactive_partition="ondemand2", num_static_nodes=2, num_dynamic_nodes=3, - dynamic_instance_type="c5.xlarge", + dynamic_instance_type=instance, ) _test_reset_terminated_nodes( scheduler_commands, @@ -153,7 +153,7 @@ def test_hit_scaling(scheduler, region, pcluster_config_reader, clusters_factory partition="ondemand1", num_static_nodes=2, num_dynamic_nodes=3, - dynamic_instance_type="c5.xlarge", + dynamic_instance_type=instance, ) _test_replace_down_nodes( remote_command_executor, @@ -164,7 +164,7 @@ def test_hit_scaling(scheduler, region, pcluster_config_reader, clusters_factory partition="ondemand1", num_static_nodes=2, num_dynamic_nodes=3, - dynamic_instance_type="c5.xlarge", + dynamic_instance_type=instance, ) _test_keep_or_replace_suspended_nodes( scheduler_commands, @@ -173,7 +173,7 @@ def test_hit_scaling(scheduler, region, pcluster_config_reader, clusters_factory partition="ondemand1", num_static_nodes=2, num_dynamic_nodes=3, - dynamic_instance_type="c5.xlarge", + dynamic_instance_type=instance, ) _test_computemgtd_logic( remote_command_executor, @@ -184,29 +184,30 @@ def test_hit_scaling(scheduler, region, pcluster_config_reader, clusters_factory partition="ondemand1", num_static_nodes=2, num_dynamic_nodes=3, - dynamic_instance_type="c5.xlarge", + dynamic_instance_type=instance, ) assert_no_errors_in_logs(remote_command_executor, scheduler) -def _assert_cluster_initial_conditions(scheduler_commands): +def _assert_cluster_initial_conditions(scheduler_commands, instance): """Assert that expected nodes are in cluster.""" cluster_node_states = scheduler_commands.get_nodes_status() - c4_nodes, c5_nodes, static_nodes, dynamic_nodes = [], [], [], [] + c5l_nodes, instance_nodes, static_nodes, dynamic_nodes = [], [], [], [] logging.info(cluster_node_states) for nodename, node_states in cluster_node_states.items(): - 
if "c4" in nodename: - c4_nodes.append(nodename) - if "c5" in nodename: - c5_nodes.append(nodename) + if "c5l" in nodename: + c5l_nodes.append(nodename) + # "c5.xlarge"[: "c5.xlarge".index(".")+2].replace(".", "") = c5x + if instance[: instance.index(".") + 2].replace(".", "") in nodename: + instance_nodes.append(nodename) if node_states == "idle": if "-st-" in nodename: static_nodes.append(nodename) if "-dy-" in nodename: dynamic_nodes.append(nodename) - assert_that(len(c4_nodes)).is_equal_to(20) - assert_that(len(c5_nodes)).is_equal_to(20) + assert_that(len(c5l_nodes)).is_equal_to(20) + assert_that(len(instance_nodes)).is_equal_to(20) assert_that(len(static_nodes)).is_equal_to(4) assert_that(len(dynamic_nodes)).is_equal_to(1) diff --git a/tests/integration-tests/tests/scaling/test_scaling/test_hit_scaling/pcluster.config.ini b/tests/integration-tests/tests/scaling/test_scaling/test_hit_scaling/pcluster.config.ini index 1d2563852a..6635fa44c8 100644 --- a/tests/integration-tests/tests/scaling/test_scaling/test_hit_scaling/pcluster.config.ini +++ b/tests/integration-tests/tests/scaling/test_scaling/test_hit_scaling/pcluster.config.ini @@ -20,14 +20,14 @@ compute_resource_settings = ondemand_i1,ondemand_i2 compute_resource_settings = ondemand_i3,ondemand_i4 [compute_resource ondemand_i1] -instance_type = c4.xlarge +instance_type = c5.large [compute_resource ondemand_i2] instance_type = {{ instance }} min_count = 2 [compute_resource ondemand_i3] -instance_type = c4.xlarge +instance_type = c5.large [compute_resource ondemand_i4] instance_type = {{ instance }} From 8d696fbf1c56e284c2abcf74d8fa029871b831bc Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Fri, 4 Dec 2020 14:23:21 +0100 Subject: [PATCH 27/66] Remove awsbatch access test if scheduler is not awsbatch What does the change solve? The change allows to run the iam policies test on the regions where AWS Batch is not present Signed-off-by: Luca Carrogu --- .../integration-tests/tests/iam_policies/test_iam_policies.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/iam_policies/test_iam_policies.py b/tests/integration-tests/tests/iam_policies/test_iam_policies.py index 038b6c3263..f14a49dd19 100644 --- a/tests/integration-tests/tests/iam_policies/test_iam_policies.py +++ b/tests/integration-tests/tests/iam_policies/test_iam_policies.py @@ -31,7 +31,9 @@ def test_iam_policies(region, scheduler, pcluster_config_reader, clusters_factor remote_command_executor = RemoteCommandExecutor(cluster) _test_s3_access(remote_command_executor, region) - _test_batch_access(remote_command_executor, region) + + if scheduler == "awsbatch": + _test_batch_access(remote_command_executor, region) assert_no_errors_in_logs(remote_command_executor, scheduler) From a4f9825a8a122415da96173fd8b47c9349c6380f Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Thu, 3 Dec 2020 19:01:20 +0100 Subject: [PATCH 28/66] Add test configuration file for new region support This config will be used as test bed for new region. 
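The new_region.yaml below drives every suite from a single NEW_REGION value that is set to the literal ##PLACEHOLDER## string, so the region under test has to be substituted before the file is passed to the test runner. One way to do that, shown only as an illustration (path and region name are examples):

from pathlib import Path

# Point the config at the region being onboarded before invoking test_runner.py.
config = Path("tests/integration-tests/configs/new_region.yaml")
config.write_text(config.read_text().replace("##PLACEHOLDER##", "me-south-1"))
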
Signed-off-by: Luca Carrogu --- .../integration-tests/configs/new_region.yaml | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 tests/integration-tests/configs/new_region.yaml diff --git a/tests/integration-tests/configs/new_region.yaml b/tests/integration-tests/configs/new_region.yaml new file mode 100644 index 0000000000..e77e1f3ed0 --- /dev/null +++ b/tests/integration-tests/configs/new_region.yaml @@ -0,0 +1,169 @@ +{%- import 'common.jinja2' as common -%} +{%- set NEW_REGION = ["##PLACEHOLDER##"] -%} +--- +test-suites: + scaling: + test_scaling.py::test_multiple_jobs_submission: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: {{ common.SCHEDULERS_TRAD }} + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: {{ common.SCHEDULERS_TRAD }} + test_mpi.py::test_mpi: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm", "sge"] + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: ["slurm", "sge"] + schedulers: + test_awsbatch.py::test_awsbatch: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["alinux2"] + schedulers: ["awsbatch"] + cloudwatch_logging: + test_cloudwatch_logging.py::test_cloudwatch_logging: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804"] + schedulers: ["slurm"] + configure: + test_pcluster_configure.py::test_pcluster_configure: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_ONE_PER_DISTRO }} + schedulers: {{ common.SCHEDULERS_TRAD }} + cli_commands: + test_cli_commands.py::test_hit_cli_commands: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1604"] + schedulers: ["slurm"] + test_cli_commands.py::test_sit_cli_commands: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos7"] + schedulers: ["sge"] + update: + test_update.py::test_update_hit: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + createami: + test_createami.py::test_createami: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux", "alinux2", "ubuntu1604", "ubuntu1804"] # temporary disable FPGA AMI since there is not enough free space on root partition + dashboard: + test_dashboard.py::test_dashboard: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos8"] + schedulers: ["slurm"] + dcv: + test_dcv.py::test_dcv_configuration: + dimensions: + # DCV on GPU enabled instance + - regions: {{ NEW_REGION }} + instances: ["g3.8xlarge"] + oss: ["ubuntu1804"] + schedulers: ["slurm"] + # DCV om non GPU enabled instance + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + dns: + test_dns.py::test_hit_no_cluster_dns_mpi: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ 
common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + efa: + test_efa.py::test_hit_efa: + dimensions: + - regions: {{ NEW_REGION }} + instances: ["c5n.18xlarge"] + oss: ["alinux2"] + schedulers: ["slurm"] + iam_policies: + test_iam_policies.py::test_iam_policies: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + networking: + test_cluster_networking.py::test_cluster_in_private_subnet: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu1804"] + schedulers: ["slurm"] + test_networking.py::test_public_network_topology: + dimensions: + - regions: {{ NEW_REGION }} + test_networking.py::test_public_private_network_topology: + dimensions: + - regions: {{ NEW_REGION }} + test_multi_cidr.py::test_multi_cidr: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + spot: + test_spot.py::test_spot_default: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["centos8"] + schedulers: ["slurm"] + storage: + test_fsx_lustre.py::test_fsx_lustre: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux"] + schedulers: ["slurm"] + test_efs.py::test_efs_compute_az: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + test_ebs.py::test_ebs_multiple: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] + resource_bucket: + test_resource_bucket.py::test_resource_bucket: + dimensions: + - regions: {{ NEW_REGION }} + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] From f19699661450f8857f40eb32ecd7a69fdc07f566 Mon Sep 17 00:00:00 2001 From: ddeidda Date: Fri, 4 Dec 2020 12:59:54 +0100 Subject: [PATCH 29/66] Fix Network Interfaces count refresh code The `network_interfaces_count` parameter depends on `compute_instance_type`, hence it could fail if this parameter is not specified in the config file. Since the default instance type will always have 1 network interface we can safely return 1 when compute_instance_type is not specified. 
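Schematically, the refreshed logic in the diff below resolves the compute instance type only for SIT clusters that do not use awsbatch and reports a single interface in every other case. A simplified restatement with the InstanceTypeInfo lookup stubbed out (helper names are illustrative):

def max_nics(instance_type):
    """Stand-in for InstanceTypeInfo.init_from_instance_type(instance_type).max_network_interface_count()."""
    return 1


def network_interfaces_count(master_instance_type, compute_instance_type=None):
    """Return [head node NIC count, compute NIC count] as strings, defaulting the compute side to 1."""
    head_node_nics = str(max_nics(master_instance_type))
    # compute_instance_type is left unset for HIT clusters and awsbatch, where one NIC is assumed.
    compute_nics = str(max_nics(compute_instance_type)) if compute_instance_type else "1"
    return [head_node_nics, compute_nics]
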
Signed-off-by: ddeidda --- cli/src/pcluster/config/cfn_param_types.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cli/src/pcluster/config/cfn_param_types.py b/cli/src/pcluster/config/cfn_param_types.py index dfdb7f3816..6a0bdca392 100644 --- a/cli/src/pcluster/config/cfn_param_types.py +++ b/cli/src/pcluster/config/cfn_param_types.py @@ -1113,18 +1113,19 @@ def refresh(self): """Compute the number of network interfaces for head node and compute nodes.""" cluster_section = self.pcluster_config.get_section("cluster") scheduler = cluster_section.get_param_value("scheduler") + compute_instance_type = ( + cluster_section.get_param_value("compute_instance_type") + if self.pcluster_config.cluster_model.name == "SIT" and scheduler != "awsbatch" + else None + ) self.value = [ str( InstanceTypeInfo.init_from_instance_type( cluster_section.get_param_value("master_instance_type") ).max_network_interface_count() ), - str( - InstanceTypeInfo.init_from_instance_type( - cluster_section.get_param_value("compute_instance_type") - ).max_network_interface_count() - ) - if self.pcluster_config.cluster_model.name == "SIT" and scheduler != "awsbatch" + str(InstanceTypeInfo.init_from_instance_type(compute_instance_type).max_network_interface_count()) + if compute_instance_type else "1", ] From d7b0f087d08d95370927e7cefaf6c04d7dfb4289 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Fri, 27 Nov 2020 09:50:28 +0100 Subject: [PATCH 30/66] Remove duplicated test and enable p4d.xlarge test with sge The test using p4d.24xlarge with slurm scheduler is already performed by the test_hit_efa test Change test_sit_efa to use sge and move it to us-west-2 Remove warning when using p4d.24xlarge with scheduler != slurm Signed-off-by: Luca Carrogu --- cli/src/pcluster/config/validators.py | 11 ----------- tests/integration-tests/configs/common/common.yaml | 5 ++--- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index 1e09fff2a7..20f17f277e 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -1021,17 +1021,6 @@ def compute_instance_type_validator(param_key, param_value, pcluster_config): else: errors, warnings = ec2_instance_type_validator(param_key, param_value, pcluster_config) - if scheduler != "slurm": - # Multiple NICs instance types are currently supported only with Slurm clusters - instance_nics = InstanceTypeInfo.init_from_instance_type(param_value).max_network_interface_count() - if instance_nics > 1: - warnings.append( - "Some services needed to support clusters with instance type '{0}' with multiple " - "network interfaces and job scheduler '{1}' may not yet be generally available. 
" - "Please refer to https://docs.aws.amazon.com/autoscaling/ec2/userguide/create-launch-template.html " - "for more information.".format(param_value, scheduler) - ) - return errors, warnings diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index af18d578c1..f84e873c8d 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -208,11 +208,10 @@ efa: # Torque is not supported by OpenMPI distributed with EFA # Slurm test is to verify EFA works correctly when using the SIT model in the config file schedulers: ["sge", "slurm"] - # P4d instances are currently not supported in SIT clusters - - regions: ["us-east-1"] + - regions: ["us-west-2"] instances: ["p4d.24xlarge"] oss: ["alinux", "ubuntu1804", "centos7"] - schedulers: ["slurm"] + schedulers: ["sge"] iam_policies: test_iam_policies.py::test_iam_policies: dimensions: From 8cbd3c246e128c685ec736b25dd132f24a71abe4 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 7 Dec 2020 13:12:09 +0100 Subject: [PATCH 31/66] Change queue instances type from c4.xlarge to c5.large Reason for this change is that not all the regions support c4.xlarge. C5 family support is broader What does this change solve? It allows to run the test where C4 isn't present Signed-off-by: Luca Carrogu --- .../test_hit_cli_commands/pcluster.config.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/cli_commands/test_cli_commands/test_hit_cli_commands/pcluster.config.ini b/tests/integration-tests/tests/cli_commands/test_cli_commands/test_hit_cli_commands/pcluster.config.ini index 5a5c2ed474..a6a795cfe1 100644 --- a/tests/integration-tests/tests/cli_commands/test_cli_commands/test_hit_cli_commands/pcluster.config.ini +++ b/tests/integration-tests/tests/cli_commands/test_cli_commands/test_hit_cli_commands/pcluster.config.ini @@ -20,14 +20,14 @@ compute_resource_settings = ondemand_i1,ondemand_i2 compute_resource_settings = ondemand_i3,ondemand_i4 [compute_resource ondemand_i1] -instance_type = c4.xlarge +instance_type = c5.large [compute_resource ondemand_i2] instance_type = {{ instance }} min_count = 1 [compute_resource ondemand_i3] -instance_type = c4.xlarge +instance_type = c5.large [compute_resource ondemand_i4] instance_type = {{ instance }} From d8febe90cb16582bfff8a8fd230a617b692e8dc6 Mon Sep 17 00:00:00 2001 From: Hanwen <68928867+hanwen-pcluste@users.noreply.github.com> Date: Mon, 7 Dec 2020 13:27:48 -0500 Subject: [PATCH 32/66] Mandate the presence of vpc_settings, vpc_id, master_subnet_id in the config file (#2276) These parameters have been essential to create a cluster/AMI. Before this commit, the code does not check the existence of the parameters, causing unhandled exceptions if a config file does not have any of the parameters. 
This commit adds `OrderedDict` to VPC section in `mappings.py` to make the code compatible with Python <= 3.5 Signed-off-by: Hanwen --- CHANGELOG.md | 2 + cli/src/pcluster/config/mappings.py | 47 ++++++++++--------- .../pcluster/models/hit/hit_cluster_model.py | 2 +- .../pcluster/models/sit/sit_cluster_model.py | 4 +- cli/tests/pcluster/config/utils.py | 41 ++++++++-------- 5 files changed, 50 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c1e25cb86..a76a5dde29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,8 @@ CHANGELOG **BUG FIXES** +- Mandate the presence of `vpc_settings`, `vpc_id`, `master_subnet_id` in the config file to avoid unhandled exceptions. + 2.10.0 ------ diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py index 768ec3255b..7479626bc6 100644 --- a/cli/src/pcluster/config/mappings.py +++ b/cli/src/pcluster/config/mappings.py @@ -240,68 +240,70 @@ "key": "vpc", "default_label": "default", "autocreate": True, - "params": { - "vpc_id": { + "params": OrderedDict([ + ("vpc_id", { "cfn_param_mapping": "VPCId", + "required": True, "allowed_values": ALLOWED_VALUES["vpc_id"], "validators": [ec2_vpc_id_validator], "update_policy": UpdatePolicy.UNSUPPORTED - }, - "master_subnet_id": { + }), + ("master_subnet_id", { "cfn_param_mapping": "MasterSubnetId", + "required": True, "allowed_values": ALLOWED_VALUES["subnet_id"], "validators": [ec2_subnet_id_validator], "update_policy": UpdatePolicy.UNSUPPORTED - }, - "ssh_from": { + }), + ("ssh_from", { "default": CIDR_ALL_IPS, "allowed_values": ALLOWED_VALUES["cidr"], "cfn_param_mapping": "AccessFrom", "update_policy": UpdatePolicy.SUPPORTED - }, - "additional_sg": { + }), + ("additional_sg", { "cfn_param_mapping": "AdditionalSG", "allowed_values": ALLOWED_VALUES["security_group_id"], "validators": [ec2_security_group_validator], "update_policy": UpdatePolicy.SUPPORTED - }, - "compute_subnet_id": { + }), + ("compute_subnet_id", { "cfn_param_mapping": "ComputeSubnetId", "allowed_values": ALLOWED_VALUES["subnet_id"], "validators": [ec2_subnet_id_validator], "update_policy": UpdatePolicy.COMPUTE_FLEET_STOP - }, - "compute_subnet_cidr": { + }), + ("compute_subnet_cidr", { "cfn_param_mapping": "ComputeSubnetCidr", "allowed_values": ALLOWED_VALUES["cidr"], "update_policy": UpdatePolicy.UNSUPPORTED - }, - "use_public_ips": { + }), + ("use_public_ips", { "type": BoolCfnParam, "default": True, "cfn_param_mapping": "UsePublicIps", "update_policy": UpdatePolicy.COMPUTE_FLEET_STOP - }, - "vpc_security_group_id": { + }), + ("vpc_security_group_id", { "cfn_param_mapping": "VPCSecurityGroupId", "allowed_values": ALLOWED_VALUES["security_group_id"], "validators": [ec2_security_group_validator], "update_policy": UpdatePolicy.SUPPORTED - }, - "master_availability_zone": { + }), + ("master_availability_zone", { # NOTE: this is not exposed as a configuration parameter "type": HeadNodeAvailabilityZoneCfnParam, "cfn_param_mapping": "AvailabilityZone", "update_policy": UpdatePolicy.IGNORED, "visibility": Visibility.PRIVATE - }, - "compute_availability_zone": { + }), + ("compute_availability_zone", { # NOTE: this is not exposed as a configuration parameter "type": ComputeAvailabilityZoneCfnParam, "update_policy": UpdatePolicy.IGNORED, "visibility": Visibility.PRIVATE - } - }, + }) + ]), } EBS = { @@ -922,6 +924,7 @@ }), ("vpc_settings", { "type": SettingsCfnParam, + "required": True, "referred_section": VPC, "update_policy": UpdatePolicy.UNSUPPORTED, }), diff --git 
a/cli/src/pcluster/models/hit/hit_cluster_model.py b/cli/src/pcluster/models/hit/hit_cluster_model.py index 5497dd123a..3b60a7f350 100644 --- a/cli/src/pcluster/models/hit/hit_cluster_model.py +++ b/cli/src/pcluster/models/hit/hit_cluster_model.py @@ -42,7 +42,7 @@ def test_configuration(self, pcluster_config): cluster_section = pcluster_config.get_section("cluster") vpc_section = pcluster_config.get_section("vpc") - if not cluster_section or cluster_section.get_param_value("scheduler") == "awsbatch" or not vpc_section: + if cluster_section.get_param_value("scheduler") == "awsbatch": return head_node_instance_type = cluster_section.get_param_value("master_instance_type") diff --git a/cli/src/pcluster/models/sit/sit_cluster_model.py b/cli/src/pcluster/models/sit/sit_cluster_model.py index 11cc291691..8bdc85f887 100644 --- a/cli/src/pcluster/models/sit/sit_cluster_model.py +++ b/cli/src/pcluster/models/sit/sit_cluster_model.py @@ -60,10 +60,8 @@ def test_configuration(self, pcluster_config): vpc_section = pcluster_config.get_section("vpc") if ( - not cluster_section - or cluster_section.get_param_value("scheduler") == "awsbatch" + cluster_section.get_param_value("scheduler") == "awsbatch" or cluster_section.get_param_value("cluster_type") == "spot" - or not vpc_section ): return diff --git a/cli/tests/pcluster/config/utils.py b/cli/tests/pcluster/config/utils.py index e1bdbacb77..c10ebbd00f 100644 --- a/cli/tests/pcluster/config/utils.py +++ b/cli/tests/pcluster/config/utils.py @@ -12,6 +12,7 @@ import os import shutil import tempfile +from collections import OrderedDict import configparser import pytest @@ -144,23 +145,6 @@ def mock_instance_type_info(mocker, instance_type="t2.micro"): ) -def mock_ec2_key_pair(mocker, cluster_section_dict): - if cluster_section_dict.get("key_name") is None: - cluster_section_dict["key_name"] = "test_key" - - mocker.patch( - "pcluster.config.validators._describe_ec2_key_pair", - return_value={ - "KeyPairs": [ - { - "KeyFingerprint": "12:bf:7c:56:6c:dd:4f:8c:24:45:75:f1:1b:16:54:89:82:09:a4:26", - "KeyName": "test_key", - } - ] - }, - ) - - def assert_param_validator( mocker, config_parser_dict, @@ -168,7 +152,6 @@ def assert_param_validator( capsys=None, expected_warning=None, extra_patches=None, - use_mock_ec2_key_pair=True, ): config_parser = configparser.ConfigParser() @@ -177,7 +160,6 @@ def assert_param_validator( set_default_values_for_required_cluster_section_params( config_parser_dict.get("cluster default"), only_if_not_present=True ) - mock_ec2_key_pair(mocker, config_parser_dict.get("cluster default")) config_parser.read_dict(config_parser_dict) mock_pcluster_config(mocker, config_parser_dict.get("cluster default").get("scheduler"), extra_patches) @@ -411,10 +393,29 @@ def init_pcluster_config_from_configparser(config_parser, validate=True, auto_re config_file=config_file.name, cluster_label="default", fail_on_file_absence=True, auto_refresh=auto_refresh ) if validate: - pcluster_config.validate() + _validate_config(config_parser, pcluster_config) return pcluster_config +def _validate_config(config_parser, pcluster_config): + """Validate sections and params in config_parser by the order specified in the pcluster_config.""" + for section_key in pcluster_config.get_section_keys(): + for section_label in pcluster_config.get_sections(section_key).keys(): + section_name = section_key + " " + section_label if section_label else section_key + if section_name in config_parser.sections(): + pcluster_config_section = pcluster_config.get_section(section_key, 
section_label) + for validation_func in pcluster_config_section.definition.get("validators", []): + errors, warnings = validation_func(section_key, section_label, pcluster_config) + if errors: + pcluster_config.error(errors) + elif warnings: + pcluster_config.warn(warnings) + config_parser_section = OrderedDict(config_parser.items(section_name)) + for param_key in pcluster_config_section.params: + if param_key in config_parser_section: + pcluster_config_section.get_param(param_key).validate() + + def duplicate_config_file(dst_config_file, test_datadir): # Make a copy of the src template to the target file. # The two resulting PClusterConfig instances will be identical From 6d76408915f587aa7a7f2b6b6e9d633d835d2289 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 7 Dec 2020 13:53:51 +0100 Subject: [PATCH 33/66] Add support for me-south-1, af-south-1 and eu-south-1 New regions are me-south-1 (Bahrein), af-south-1 (Cape Town) and eu-south-1 (Milan) Distribute the integration test to use these new regions * AWS Lustre and ARM instance type are not yet supported in me-south-1, af-south-1 and eu-south-1 * AWS Batch is not yet supported in af-south-1 * EBS io2 is not yet supported in af-south-1 and eu-south-1 Signed-off-by: Luca Carrogu --- CHANGELOG.md | 5 +++++ tests/integration-tests/configs/common.jinja2 | 2 +- .../configs/common/common.yaml | 22 +++++++++---------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a76a5dde29..b075c614f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,11 @@ CHANGELOG **ENHANCEMENTS** +- Add support for me-south-1 region (Bahrein), af-south-1 region (Cape Town) and eu-south-1 region (Milan) + - At the time of this version launch: + - AWS Lustre and ARM instance type are not supported in me-south-1, af-south-1 and eu-south-1 + - AWS Batch is not supported in af-south-1 + - EBS io2 is not supported in af-south-1 and eu-south-1 - Remove CloudFormation DescribeStacks API call from AWS Batch Docker entrypoint. This removes the possibility of job failures due to CloudFormation throttling. - Add support for io2 EBS volume type. 
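Since several services and instance families are not yet available in the new regions, as the changelog entry above spells out, it helps to confirm an instance type offering before enabling a test dimension there. A small helper along these lines, shown purely as an illustration (it is not part of this change):

import boto3


def instance_type_offered(instance_type, region):
    """Return True if EC2 offers the given instance type in the given region."""
    ec2 = boto3.client("ec2", region_name=region)
    offerings = ec2.describe_instance_type_offerings(
        LocationType="region",
        Filters=[{"Name": "instance-type", "Values": [instance_type]}],
    )
    return bool(offerings["InstanceTypeOfferings"])


# For example, ARM (m6g) instances were not yet offered in the new regions at the time of this change.
print(instance_type_offered("m6g.xlarge", "me-south-1"))
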
diff --git a/tests/integration-tests/configs/common.jinja2 b/tests/integration-tests/configs/common.jinja2 index e8d05bd703..d73d3ddec8 100644 --- a/tests/integration-tests/configs/common.jinja2 +++ b/tests/integration-tests/configs/common.jinja2 @@ -1,4 +1,4 @@ -{%- set REGIONS_COMMERCIAL = ["us-east-1", "us-east-2", "us-west-1", "us-west-2", "ca-central-1", "eu-central-1", "eu-west-1", "eu-west-2", "eu-west-3", "sa-east-1", "ap-east-1", "ap-northeast-1", "ap-northeast-2", "ap-south-1", "ap-southeast-1", "ap-southeast-2", "eu-north-1"] -%} +{%- set REGIONS_COMMERCIAL = ["us-east-1", "us-east-2", "us-west-1", "us-west-2", "ca-central-1", "eu-central-1", "eu-west-1", "eu-west-2", "eu-west-3", "sa-east-1", "ap-east-1", "ap-northeast-1", "ap-northeast-2", "ap-south-1", "ap-southeast-1", "ap-southeast-2", "eu-north-1", "me-south-1", "af-south-1", "eu-south-1"] -%} {%- set REGIONS_CHINA = ["cn-north-1", "cn-northwest-1"] -%} {%- set REGIONS_GOVCLOUD = ["us-gov-west-1", "us-gov-east-1"] -%} {%- set REGIONS_ALL = REGIONS_COMMERCIAL + REGIONS_CHINA + REGIONS_GOVCLOUD -%} diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index f84e873c8d..87a88c0d9c 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -1,7 +1,7 @@ cfn-init: test_cfn_init.py::test_replace_compute_on_failure: dimensions: - - regions: ["eu-central-1"] + - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: {{ common.OSS_ONE_PER_DISTRO }} schedulers: ["slurm", "sge"] @@ -77,7 +77,7 @@ configure: create: test_create.py::test_create_wrong_os: dimensions: - - regions: ["eu-central-1"] + - regions: ["af-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["ubuntu1804"] # os must be different from centos7 to test os validation logic when wrong os is provided schedulers: ["slurm"] @@ -101,7 +101,7 @@ createami: oss: ["alinux2"] test_createami.py::test_createami_post_install: dimensions: - - regions: ["ap-southeast-2"] + - regions: ["eu-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["centos7", "ubuntu1804"] - regions: ["eu-west-1"] @@ -109,7 +109,7 @@ createami: oss: ["alinux2"] test_createami.py::test_createami_wrong_os: dimensions: - - regions: ["eu-central-1"] + - regions: ["eu-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux"] # os must be different from alinux2 to test os validation logic when wrong os is provided test_createami.py::test_createami_wrong_pcluster_version: @@ -229,7 +229,7 @@ intel_hpc: networking: test_cluster_networking.py::test_cluster_in_private_subnet: dimensions: - - regions: ["us-west-2"] + - regions: ["me-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["slurm"] @@ -239,10 +239,10 @@ networking: schedulers: ["sge"] test_networking.py::test_public_network_topology: dimensions: - - regions: ["eu-central-1", "us-gov-east-1", "cn-northwest-1"] + - regions: ["af-south-1", "us-gov-east-1", "cn-northwest-1"] test_networking.py::test_public_private_network_topology: dimensions: - - regions: ["eu-central-1", "us-gov-east-1", "cn-northwest-1"] + - regions: ["af-south-1", "us-gov-east-1", "cn-northwest-1"] test_multi_cidr.py::test_multi_cidr: dimensions: - regions: ["ap-northeast-2"] @@ -305,7 +305,7 @@ scaling: schedulers: test_sge.py::test_sge: dimensions: - - regions: ["eu-central-1"] + - regions: ["eu-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: {{ 
common.OSS_COMMERCIAL_X86 }} schedulers: ["sge"] @@ -356,7 +356,7 @@ schedulers: spot: test_spot.py::test_spot_default: dimensions: - - regions: ["us-west-2"] + - regions: ["me-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["centos7"] schedulers: ["sge", "slurm"] @@ -499,13 +499,13 @@ tags: update: test_update.py::test_update_awsbatch: dimensions: - - regions: ["eu-west-1"] + - regions: ["eu-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["awsbatch"] test_update.py::test_update_hit: dimensions: - - regions: ["eu-west-1"] + - regions: ["me-south-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: {{ common.OSS_COMMERCIAL_X86 }} schedulers: ["slurm"] From 17be921f2eb0098e06dcd36d638b52352f0c36af Mon Sep 17 00:00:00 2001 From: chenwany Date: Fri, 4 Dec 2020 12:01:00 -0800 Subject: [PATCH 34/66] integ-test: test ebs encrypted with kms key Signed-off-by: chenwany --- .../resources/batch_instance_policy.json | 117 ++++++++++ .../resources/key_policy.json | 37 ++++ .../traditional_instance_policy.json | 173 +++++++++++++++ .../tests/storage/kms_key_factory.py | 203 ++++++++++++++++++ .../tests/storage/test_ebs.py | 25 ++- .../test_ebs_single/pcluster.config.ini | 3 + 6 files changed, 556 insertions(+), 2 deletions(-) create mode 100644 tests/integration-tests/resources/batch_instance_policy.json create mode 100644 tests/integration-tests/resources/key_policy.json create mode 100644 tests/integration-tests/resources/traditional_instance_policy.json create mode 100644 tests/integration-tests/tests/storage/kms_key_factory.py diff --git a/tests/integration-tests/resources/batch_instance_policy.json b/tests/integration-tests/resources/batch_instance_policy.json new file mode 100644 index 0000000000..857f043594 --- /dev/null +++ b/tests/integration-tests/resources/batch_instance_policy.json @@ -0,0 +1,117 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "batch:SubmitJob", + "cloudformation:DescribeStacks", + "ecs:ListContainerInstances", + "ecs:DescribeContainerInstances", + "logs:FilterLogEvents", + "s3:PutObject", + "s3:DeleteObject", + "iam:PassRole" + ], + "Resource": [ + "arn:{{ partition }}:batch:{{ region }}:{{ account_id }}:job-definition/*", + "arn:{{ partition }}:batch:{{ region }}:{{ account_id }}:job-definition/*", + "arn:{{ partition }}:batch:{{ region }}:{{ account_id }}:job-queue/*", + "arn:{{ partition }}:cloudformation:{{ region }}:{{ account_id }}:stack/*", + "arn:{{ partition }}:s3:::{{ cluster_bucket_name }}/*", + "arn:{{ partition }}:iam::{{ account_id }}:role/*", + "arn:{{ partition }}:ecs:{{ region }}:{{ account_id }}:cluster/*", + "arn:{{ partition }}:ecs:{{ region }}:{{ account_id }}:container-instance/*", + "arn:{{ partition }}:logs:{{ region }}:{{ account_id }}:log-group:/aws/batch/job:log-stream:*" + ], + "Effect": "Allow" + }, + { + "Action": [ + "batch:RegisterJobDefinition", + "logs:GetLogEvents" + ], + "Resource": [ + "*" + ], + "Effect": "Allow" + }, + { + "Action": [ + "s3:Get*" + ], + "Resource": [ + "arn:{{ partition }}:s3:::{{ cluster_bucket_name }}/*" + ], + "Effect": "Allow" + }, + { + "Action": [ + "s3:List*" + ], + "Resource": [ + "arn:{{ partition }}:s3:::{{ cluster_bucket_name }}" + ], + "Effect": "Allow" + }, + { + "Action": [ + "batch:DescribeJobQueues", + "batch:TerminateJob", + "batch:DescribeJobs", + "batch:CancelJob", + "batch:DescribeJobDefinitions", + "batch:ListJobs", + "batch:DescribeComputeEnvironments" + ], + "Resource": [ + "*" + ], + "Effect": "Allow" + }, + { + 
"Action": [ + "ec2:DescribeInstances", + "ec2:AttachVolume", + "ec2:DescribeVolumes", + "ec2:DescribeInstanceAttribute" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "EC2" + }, + { + "Action": [ + "cloudformation:DescribeStackResource", + "cloudformation:SignalResource" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "CloudFormation" + }, + { + "Action": [ + "fsx:DescribeFileSystems" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "FSx" + }, + { + "Action": [ + "logs:CreateLogGroup", + "logs:CreateLogStream" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "CWLogs" + } + ] +} \ No newline at end of file diff --git a/tests/integration-tests/resources/key_policy.json b/tests/integration-tests/resources/key_policy.json new file mode 100644 index 0000000000..579f3bc0c8 --- /dev/null +++ b/tests/integration-tests/resources/key_policy.json @@ -0,0 +1,37 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "Enable IAM User Permissions", + "Effect": "Allow", + "Principal": {"AWS": "arn:{{ partition }}:iam::{{ account_id }}:root"}, + "Action": "kms:*", + "Resource": "*" + }, + { + "Sid": "Allow use of the key", + "Effect": "Allow", + "Principal": { + "AWS": "arn:{{ partition }}:iam::{{ account_id }}:role/{{ iam_role_name }}" + }, + "Action": [ + "kms:Encrypt", + "kms:Decrypt", + "kms:ReEncrypt*", + "kms:GenerateDataKey*", + "kms:DescribeKey" + ], + "Resource": "*" + }, + { + "Sid": "Allow attachment of persistent resources", + "Effect": "Allow", + "Principal": { + "AWS": "arn:{{ partition }}:iam::{{ account_id }}:role/{{ iam_role_name }}" + }, + "Action": ["kms:CreateGrant", "kms:ListGrants", "kms:RevokeGrant"], + "Resource": "*", + "Condition": {"Bool": {"kms:GrantIsForAWSResource": "true"}} + } + ] +} \ No newline at end of file diff --git a/tests/integration-tests/resources/traditional_instance_policy.json b/tests/integration-tests/resources/traditional_instance_policy.json new file mode 100644 index 0000000000..b9c826f56d --- /dev/null +++ b/tests/integration-tests/resources/traditional_instance_policy.json @@ -0,0 +1,173 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "ec2:DescribeVolumes", + "ec2:AttachVolume", + "ec2:DescribeInstanceAttribute", + "ec2:DescribeInstanceStatus", + "ec2:DescribeInstances", + "ec2:DescribeInstanceTypes", + "ec2:DescribeRegions", + "ec2:RunInstances", + "ec2:TerminateInstances", + "ec2:DescribeLaunchTemplates", + "ec2:CreateTags" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "EC2" + }, + { + "Action": [ + "dynamodb:ListTables" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "DynamoDBList" + }, + { + "Action": [ + "sqs:SendMessage", + "sqs:ReceiveMessage", + "sqs:ChangeMessageVisibility", + "sqs:DeleteMessage", + "sqs:GetQueueUrl" + ], + "Resource": [ + "arn:{{ partition }}:sqs:{{ region }}:{{ account_id }}:parallelcluster-*" + ], + "Effect": "Allow", + "Sid": "SQSQueue" + }, + { + "Action": [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:TerminateInstanceInAutoScalingGroup", + "autoscaling:SetDesiredCapacity", + "autoscaling:UpdateAutoScalingGroup", + "autoscaling:DescribeTags", + "autoscaling:SetInstanceHealth" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "Autoscaling" + }, + { + "Action": [ + "cloudformation:DescribeStacks", + "cloudformation:DescribeStackResource", + "cloudformation:SignalResource" + ], + "Resource": [ + "arn:{{ partition }}:cloudformation:{{ region }}:{{ account_id }}:stack/parallelcluster-*/*" + 
], + "Effect": "Allow", + "Sid": "CloudFormation" + }, + { + "Action": [ + "dynamodb:PutItem", + "dynamodb:Query", + "dynamodb:GetItem", + "dynamodb:BatchWriteItem", + "dynamodb:DeleteItem", + "dynamodb:DescribeTable" + ], + "Resource": [ + "arn:{{ partition }}:dynamodb:{{ region }}:{{ account_id }}:table/parallelcluster-*" + ], + "Effect": "Allow", + "Sid": "DynamoDBTable" + }, + { + "Action": [ + "s3:GetObject" + ], + "Resource": [ + "arn:{{ partition }}:s3:::{{ region }}-aws-parallelcluster/*" + ], + "Effect": "Allow", + "Sid": "S3GetObj" + }, + { + "Action": [ + "sqs:ListQueues" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "SQSList" + }, + { + "Action": [ + "iam:PassRole" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "IAMPassRole" + }, + { + "Action": [ + "s3:GetObject" + ], + "Resource": [ + "arn:{{ partition }}:s3:::dcv-license.{{ region }}/*" + ], + "Effect": "Allow", + "Sid": "DcvLicense" + }, + { + "Action": [ + "s3:GetObject", + "s3:GetObjectVersion" + ], + "Resource": [ + "arn:{{ partition }}:s3:::{{ cluster_bucket_name }}/*" + ], + "Effect": "Allow", + "Sid": "GetClusterConfig" + }, + { + "Action": [ + "fsx:DescribeFileSystems" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "FSx" + }, + { + "Action": [ + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Resource": [ + "*" + ], + "Effect": "Allow", + "Sid": "CWLogs" + }, + { + "Action": [ + "route53:ChangeResourceRecordSets" + ], + "Resource": [ + "arn:{{ partition }}:route53:::hostedzone/*" + ], + "Effect": "Allow", + "Sid": "Route53" + } + ] +} \ No newline at end of file diff --git a/tests/integration-tests/tests/storage/kms_key_factory.py b/tests/integration-tests/tests/storage/kms_key_factory.py new file mode 100644 index 0000000000..cb6ee06d26 --- /dev/null +++ b/tests/integration-tests/tests/storage/kms_key_factory.py @@ -0,0 +1,203 @@ +import json +import logging +import random +import string +import time + +import boto3 +import pkg_resources +from jinja2 import Environment, FileSystemLoader + + +class KMSKeyFactory: + """Manage creation for kms key.""" + + def __init__(self): + self.iam_client = None + self.kms_client = None + self.kms_key_id = None + self.account_id = boto3.client("sts").get_caller_identity().get("Account") + self.region = None + self.partition = None + self.iam_role = None + self.iam_policy_arn_batch = None + self.iam_policy_arn_traditional = None + + def create_kms_key(self, region): + """ + Create a kms key with given region. + :param region: Different region need to create different keys + """ + self.region = region + if self.kms_key_id: + return self.kms_key_id + + self.iam_role = self._create_role(region) + self.kms_key_id = self._create_kms_key(region) + return self.kms_key_id + + def _create_role(self, region): + """ + Create iam role in given region. 
+
+        :param region: Create different roles on different regions, since we need to attach different policies
+        """
+        random_string = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
+        iam_role_name = "Integration_test_ParallelClusterInstanceRole_{0}_{1}".format(self.region, random_string)
+
+        iam_policy_name_batch = "".join("Integ_test_ParallelClusterInstancePolicy_batch" + random_string)
+        logging.info("iam policy for awsbatch is {0}".format(iam_policy_name_batch))
+        iam_policy_name_traditional = "".join("Integ_test_ParallelClusterInstancePolicy" + random_string)
+        logging.info("iam_policy for traditional scheduler is {0}".format(iam_policy_name_traditional))
+
+        self.iam_client = boto3.client("iam", region_name=region)
+
+        # Create the iam role
+        logging.info("creating iam role {0} for creating KMS key...".format(iam_role_name))
+
+        self.partition = next(
+            ("aws-" + partition for partition in ["us-gov", "cn"] if self.region.startswith(partition)), "aws"
+        )
+        domain_suffix = ".cn" if self.partition == "aws-cn" else ""
+
+        # Add EC2 as trust entity of the IAM role
+        trust_relationship_policy_ec2 = {
+            "Version": "2012-10-17",
+            "Statement": [
+                {
+                    "Effect": "Allow",
+                    "Principal": {"Service": "ec2.amazonaws.com{0}".format(domain_suffix)},
+                    "Action": "sts:AssumeRole",
+                }
+            ],
+        }
+        self.iam_client.create_role(
+            RoleName=iam_role_name,
+            AssumeRolePolicyDocument=json.dumps(trust_relationship_policy_ec2),
+            Description="Role for creating custom KMS key",
+        )
+        # Sleep here because it takes a while for the IAM role to become valid for use in the put_key_policy step
+        # when creating the KMS key; see the following link for reference:
+        # https://stackoverflow.com/questions/20156043/how-long-should-i-wait-after-applying-an-aws-iam-policy-before-it-is-valid
+        time.sleep(15)
+
+        # create instance policies for awsbatch and traditional schedulers
+        self.iam_policy_arn_batch = self._create_iam_policies(iam_policy_name_batch, "awsbatch")
+        self.iam_policy_arn_traditional = self._create_iam_policies(iam_policy_name_traditional, "traditional")
+
+        # attach the instance policies to the role
+        logging.info("Attaching iam policy to the role {0}...".format(iam_role_name))
+
+        # attach the instance policy for awsbatch
+        self.iam_client.attach_role_policy(RoleName=iam_role_name, PolicyArn=self.iam_policy_arn_batch)
+
+        # attach the instance policy for traditional scheduler
+        self.iam_client.attach_role_policy(RoleName=iam_role_name, PolicyArn=self.iam_policy_arn_traditional)
+
+        logging.info("Iam role is ready: {0}".format(iam_role_name))
+        return iam_role_name
+
+    def _create_iam_policies(self, iam_policy_name, scheduler):
+        # the param "scheduler" can be either "awsbatch" or "traditional"
+
+        # create the iam policy
+        # a different instance policy is attached for each scheduler type
+        logging.info("Creating iam policy {0} for iam role...".format(iam_policy_name))
+        file_loader = FileSystemLoader(pkg_resources.resource_filename(__name__, "/../../resources"))
+        env = Environment(loader=file_loader, trim_blocks=True, lstrip_blocks=True)
+        policy_filename = (
+            "batch_instance_policy.json" if scheduler == "awsbatch" else "traditional_instance_policy.json"
+        )
+        parallel_cluster_instance_policy = env.get_template(policy_filename).render(
+            partition=self.partition,
+            region=self.region,
+            account_id=self.account_id,
+            cluster_bucket_name="parallelcluster-*",
+        )
+
+        policy_res = self.iam_client.create_policy(
+            PolicyName=iam_policy_name, PolicyDocument=parallel_cluster_instance_policy
+        )
+        policy_arn = policy_res["Policy"]["Arn"]
+        return policy_arn
+
+    def _create_kms_key(self, region):
+        # create KMS key
+        self.kms_client = boto3.client("kms", region_name=region)
+        random_string = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
+        key_alias = "alias/Integration_test_KMS_key_{0}_{1}".format(self.region, random_string)
+
+        # If the key already existed, use the existing key
+        for alias in self.kms_client.list_aliases().get("Aliases"):
+            if alias.get("AliasName") == key_alias:
+                kms_key_id = alias.get("TargetKeyId")
+                logging.info("Use existing KMS key {0}".format(kms_key_id))
+                return kms_key_id
+
+        # if the key doesn't exist in the account, create a new key
+        logging.info("Creating KMS key...")
+        response = self.kms_client.create_key(
+            Description="create kms key",
+            KeyUsage="ENCRYPT_DECRYPT",
+            Origin="AWS_KMS",
+            BypassPolicyLockoutSafetyCheck=False,
+        )
+        kms_key_id = response["KeyMetadata"]["KeyId"]
+
+        # create KMS key policy
+        logging.info("Attaching key policy...")
+        file_loader = FileSystemLoader(pkg_resources.resource_filename(__name__, "/../../resources"))
+        env = Environment(loader=file_loader, trim_blocks=True, lstrip_blocks=True)
+        key_policy = env.get_template("key_policy.json").render(
+            partition=self.partition, account_id=self.account_id, iam_role_name=self.iam_role
+        )
+
+        # attach key policy to the key
+        logging.info("Kms key {0} is created".format(kms_key_id))
+        # poll_on_key_creation(kms_key_id, self.kms_client)
+        self.kms_client.put_key_policy(
+            KeyId=kms_key_id,
+            Policy=key_policy,
+            PolicyName="default",
+        )
+
+        # create alias for the key
+        self.kms_client.create_alias(
+            AliasName=key_alias,
+            TargetKeyId=kms_key_id,
+        )
+        logging.info("Kms key {0} is ready".format(kms_key_id))
+        return kms_key_id
+
+    def release_all(self):
+        """Release all resources."""
+        self._release_iam_policy()
+        self._release_iam_role()
+        self._release_kms_key()
+
+    def _release_iam_policy(self):
+        if self.iam_policy_arn_batch or self.iam_policy_arn_traditional:
+            logging.info("Deleting iam policy for awsbatch %s" % self.iam_policy_arn_batch)
+            # detach iam policy for awsbatch from iam role
+            self.iam_client.detach_role_policy(RoleName=self.iam_role, PolicyArn=self.iam_policy_arn_batch)
+            # delete the awsbatch policy
+            self.iam_client.delete_policy(PolicyArn=self.iam_policy_arn_batch)
+            logging.info("Deleting iam policy for traditional scheduler %s" % self.iam_policy_arn_traditional)
+            # detach iam policy for traditional scheduler from iam role
+            self.iam_client.detach_role_policy(RoleName=self.iam_role, PolicyArn=self.iam_policy_arn_traditional)
+            # delete the traditional scheduler policy
+            self.iam_client.delete_policy(PolicyArn=self.iam_policy_arn_traditional)
+
+    def _release_iam_role(self):
+        logging.info("Deleting iam role %s" % self.iam_role)
+        self.iam_client.delete_role(
+            RoleName=self.iam_role,
+        )
+
+    def _release_kms_key(self):
+        logging.info("Scheduling deletion of Kms key %s" % self.kms_key_id)
+        self.kms_client.schedule_key_deletion(
+            KeyId=self.kms_key_id,
+            # The waiting period, specified in number of days. After the waiting period ends, AWS KMS deletes the CMK.
+            # The waiting period is at least 7 days.
+ PendingWindowInDays=7, + ) diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index 495baaf2b5..48a5cf1553 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -18,6 +18,7 @@ from remote_command_executor import RemoteCommandExecutor from tests.common.schedulers_common import get_scheduler_commands +from tests.storage.kms_key_factory import KMSKeyFactory from tests.storage.snapshots_factory import EBSSnapshotsFactory from tests.storage.storage_common import verify_directory_correctly_shared @@ -26,16 +27,22 @@ @pytest.mark.instances(["c4.xlarge", "c5.xlarge"]) @pytest.mark.schedulers(["sge"]) @pytest.mark.usefixtures("region", "os", "instance") -def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory): +def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory, kms_key_factory, region): mount_dir = "ebs_mount_dir" - cluster_config = pcluster_config_reader(mount_dir=mount_dir) + kms_key_id = kms_key_factory.create_kms_key(region) + cluster_config = pcluster_config_reader( + mount_dir=mount_dir, ec2_iam_role=kms_key_factory.iam_role, ebs_kms_key_id=kms_key_id + ) cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) mount_dir = "/" + mount_dir scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + volume_id = get_ebs_volume_ids(cluster, region) + _test_ebs_correctly_mounted(remote_command_executor, mount_dir, volume_size=20) _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) + _test_ebs_encrypted_with_kms(volume_id, region, kms_key_id) @pytest.mark.dimensions("ap-northeast-2", "c5.xlarge", "alinux2", "sge") @@ -258,8 +265,22 @@ def _assert_volume_exist(volume_id, region): assert_that(volume_status).is_equal_to("available") +def _test_ebs_encrypted_with_kms(volume_id, region, kms_key_id): + logging.info("Getting Encrypted information from DescribeVolumes API.") + volume_info = boto3.client("ec2", region_name=region).describe_volumes(VolumeIds=volume_id).get("Volumes")[0] + assert_that(volume_info.get("Encrypted")).is_true() + assert_that(volume_info.get("KmsKeyId")).matches(kms_key_id) + + @pytest.fixture() def snapshots_factory(): factory = EBSSnapshotsFactory() yield factory factory.release_all() + + +@pytest.fixture(scope="module") +def kms_key_factory(): + factory = KMSKeyFactory() + yield factory + factory.release_all() diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini index 0e5b8c2fec..83b6cc2b0f 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini @@ -19,6 +19,7 @@ initial_queue_size = 1 maintain_initial_size = true {% endif %} ebs_settings = ebs +ec2_iam_role = {{ ec2_iam_role }} [vpc parallelcluster-vpc] vpc_id = {{ vpc_id }} @@ -30,3 +31,5 @@ use_public_ips = false shared_dir = {{ mount_dir }} volume_type = io1 volume_iops = 210 +encrypted = true +ebs_kms_key_id = {{ ebs_kms_key_id }} \ No newline at end of file From 7ab48c1e88838051b2b0ef8b1f97dedcb4164e25 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 8 Dec 2020 08:51:18 +0100 Subject: [PATCH 35/66] Move tests from regions where there are not the used test resources * t2 isn't present in all the 
regions * released AMI aren't yet present in all the regions Signed-off-by: Luca Carrogu --- tests/integration-tests/configs/common/common.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index 87a88c0d9c..4b4d51de5d 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -77,7 +77,7 @@ configure: create: test_create.py::test_create_wrong_os: dimensions: - - regions: ["af-south-1"] + - regions: ["eu-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["ubuntu1804"] # os must be different from centos7 to test os validation logic when wrong os is provided schedulers: ["slurm"] @@ -101,7 +101,7 @@ createami: oss: ["alinux2"] test_createami.py::test_createami_post_install: dimensions: - - regions: ["eu-south-1"] + - regions: ["ap-southeast-2"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["centos7", "ubuntu1804"] - regions: ["eu-west-1"] @@ -109,7 +109,7 @@ createami: oss: ["alinux2"] test_createami.py::test_createami_wrong_os: dimensions: - - regions: ["eu-south-1"] + - regions: ["eu-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux"] # os must be different from alinux2 to test os validation logic when wrong os is provided test_createami.py::test_createami_wrong_pcluster_version: @@ -505,7 +505,7 @@ update: schedulers: ["awsbatch"] test_update.py::test_update_hit: dimensions: - - regions: ["me-south-1"] + - regions: ["eu-west-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: {{ common.OSS_COMMERCIAL_X86 }} schedulers: ["slurm"] From 2bde0a88dd9d73c102f5c4c1f5077cc97f274e4d Mon Sep 17 00:00:00 2001 From: chenwany Date: Tue, 8 Dec 2020 09:27:18 -0800 Subject: [PATCH 36/66] Fix ebs create with kms key test Add endpint url for boto3 call for different region and partition Signed-off-by: chenwany --- .../tests/storage/kms_key_factory.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/storage/kms_key_factory.py b/tests/integration-tests/tests/storage/kms_key_factory.py index cb6ee06d26..7c547d967e 100644 --- a/tests/integration-tests/tests/storage/kms_key_factory.py +++ b/tests/integration-tests/tests/storage/kms_key_factory.py @@ -16,7 +16,7 @@ def __init__(self): self.iam_client = None self.kms_client = None self.kms_key_id = None - self.account_id = boto3.client("sts").get_caller_identity().get("Account") + self.account_id = None self.region = None self.partition = None self.iam_role = None @@ -29,6 +29,10 @@ def create_kms_key(self, region): :param region: Different region need to create different keys """ self.region = region + self.account_id = ( + boto3.client("sts", endpoint_url=_get_sts_endpoint(region)).get_caller_identity().get("Account") + ) + if self.kms_key_id: return self.kms_key_id @@ -201,3 +205,8 @@ def _release_kms_key(self): # The waiting period is at least 7 days. 
PendingWindowInDays=7, ) + + +def _get_sts_endpoint(region): + """Get regionalized STS endpoint.""" + return "https://sts.{0}.{1}".format(region, "amazonaws.com.cn" if region.startswith("cn-") else "amazonaws.com") From 8ea9d396bf8eaa2e68cb1c25897165f59f96301f Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Wed, 9 Dec 2020 10:29:10 +0100 Subject: [PATCH 37/66] Remove explicit call to set/unset credentials in CfnStacksFactory.create_stack set/unset credentials are called by the fixture setup_sts_credentials which has the autouse property set to true The set credentials is moved to the _create_vpc_stack method A check to avoid credentials being set twice is implemented Also, add use of fixture setup_sts_credentials to networking test, so that operation done against the stack are enclosed in lifecycle of the credentials. What the change solve? Avoid to setting credentials when they are already set Signed-off-by: Luca Carrogu --- tests/integration-tests/cfn_stacks_factory.py | 38 ++++++++----------- tests/integration-tests/conftest.py | 25 +++++++----- .../tests/networking/test_networking.py | 1 + tests/integration-tests/utils.py | 10 +++++ 4 files changed, 42 insertions(+), 32 deletions(-) diff --git a/tests/integration-tests/cfn_stacks_factory.py b/tests/integration-tests/cfn_stacks_factory.py index f721212419..65a78b4499 100644 --- a/tests/integration-tests/cfn_stacks_factory.py +++ b/tests/integration-tests/cfn_stacks_factory.py @@ -64,30 +64,24 @@ def create_stack(self, stack): """ name = stack.name region = stack.region - try: - set_credentials(region, self.__credentials) - id = self.__get_stack_internal_id(name, region) - if id in self.__created_stacks: - raise ValueError("Stack {0} already exists in region {1}".format(name, region)) + id = self.__get_stack_internal_id(name, region) + if id in self.__created_stacks: + raise ValueError("Stack {0} already exists in region {1}".format(name, region)) - logging.info("Creating stack {0} in region {1}".format(name, region)) - self.__created_stacks[id] = stack - try: - cfn_client = boto3.client("cloudformation", region_name=region) - result = cfn_client.create_stack( - StackName=name, TemplateBody=stack.template, Parameters=stack.parameters - ) - stack.cfn_stack_id = result["StackId"] - final_status = self.__wait_for_stack_creation(stack.cfn_stack_id, cfn_client) - self.__assert_stack_status(final_status, "CREATE_COMPLETE") - except Exception as e: - logging.error("Creation of stack {0} in region {1} failed with exception: {2}".format(name, region, e)) - raise - - logging.info("Stack {0} created successfully in region {1}".format(name, region)) - finally: - unset_credentials() + logging.info("Creating stack {0} in region {1}".format(name, region)) + self.__created_stacks[id] = stack + try: + cfn_client = boto3.client("cloudformation", region_name=region) + result = cfn_client.create_stack(StackName=name, TemplateBody=stack.template, Parameters=stack.parameters) + stack.cfn_stack_id = result["StackId"] + final_status = self.__wait_for_stack_creation(stack.cfn_stack_id, cfn_client) + self.__assert_stack_status(final_status, "CREATE_COMPLETE") + except Exception as e: + logging.error("Creation of stack {0} in region {1} failed with exception: {2}".format(name, region, e)) + raise + + logging.info("Stack {0} created successfully in region {1}".format(name, region)) @retry( stop_max_attempt_number=10, diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 55babf21f7..50cdac7782 100644 --- 
a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -631,16 +631,21 @@ def vpc_stack(vpc_stacks, region): retry_on_exception=lambda exception: not isinstance(exception, KeyboardInterrupt), ) def _create_vpc_stack(request, template, region, cfn_stacks_factory): - if request.config.getoption("vpc_stack"): - logging.info("Using stack {0} in region {1}".format(request.config.getoption("vpc_stack"), region)) - stack = CfnStack(name=request.config.getoption("vpc_stack"), region=region, template=template.to_json()) - else: - stack = CfnStack( - name=generate_stack_name("integ-tests-vpc", request.config.getoption("stackname_suffix")), - region=region, - template=template.to_json(), - ) - cfn_stacks_factory.create_stack(stack) + try: + set_credentials(region, request.config.getoption("credential")) + if request.config.getoption("vpc_stack"): + logging.info("Using stack {0} in region {1}".format(request.config.getoption("vpc_stack"), region)) + stack = CfnStack(name=request.config.getoption("vpc_stack"), region=region, template=template.to_json()) + else: + stack = CfnStack( + name=generate_stack_name("integ-tests-vpc", request.config.getoption("stackname_suffix")), + region=region, + template=template.to_json(), + ) + cfn_stacks_factory.create_stack(stack) + + finally: + unset_credentials() return stack diff --git a/tests/integration-tests/tests/networking/test_networking.py b/tests/integration-tests/tests/networking/test_networking.py index 227ef3dc89..144b0fcee5 100644 --- a/tests/integration-tests/tests/networking/test_networking.py +++ b/tests/integration-tests/tests/networking/test_networking.py @@ -19,6 +19,7 @@ @pytest.fixture() +@pytest.mark.usefixtures("setup_sts_credentials") def networking_stack_factory(request): """Define a fixture to manage the creation and destruction of CloudFormation stacks.""" factory = CfnStacksFactory(request.config.getoption("credential")) diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py index 2cdb18196c..cfce9d9c0b 100644 --- a/tests/integration-tests/utils.py +++ b/tests/integration-tests/utils.py @@ -232,6 +232,10 @@ def set_credentials(region, credential_arg): :param region: region of the bucket :param credential_arg: credential list """ + if os.environ.get("AWS_CREDENTIALS_FOR_REGION", "no_region") == region: + logging.info(f"AWS credentials are already set for region: {region}") + return + if credential_arg: # credentials = dict { region1: (endpoint1, arn1, external_id1), # region2: (endpoint2, arn2, external_id2), @@ -252,6 +256,8 @@ def set_credentials(region, credential_arg): credential_endpoint, credential_arn, credential_external_id, region ) + logging.info(f"Setting AWS credentials for region: {region}") + # Set credential for all boto3 client boto3.setup_default_session( aws_access_key_id=aws_credentials["AccessKeyId"], @@ -263,6 +269,7 @@ def set_credentials(region, credential_arg): os.environ["AWS_ACCESS_KEY_ID"] = aws_credentials["AccessKeyId"] os.environ["AWS_SECRET_ACCESS_KEY"] = aws_credentials["SecretAccessKey"] os.environ["AWS_SESSION_TOKEN"] = aws_credentials["SessionToken"] + os.environ["AWS_CREDENTIALS_FOR_REGION"] = region def _retrieve_sts_credential(credential_endpoint, credential_arn, credential_external_id, region): @@ -283,6 +290,7 @@ def _retrieve_sts_credential(credential_endpoint, credential_arn, credential_ext def unset_credentials(): """Unset credentials""" # Unset credential for all boto3 client + logging.info("Unsetting AWS credentials") 
boto3.setup_default_session() # Unset credential for cli command e.g. pcluster create if "AWS_ACCESS_KEY_ID" in os.environ: @@ -291,6 +299,8 @@ def unset_credentials(): del os.environ["AWS_SECRET_ACCESS_KEY"] if "AWS_SESSION_TOKEN" in os.environ: del os.environ["AWS_SESSION_TOKEN"] + if "AWS_CREDENTIALS_FOR_REGION" in os.environ: + del os.environ["AWS_CREDENTIALS_FOR_REGION"] def set_logger_formatter(formatter): From 2d54dea909af07bae525be142938f1f2196214b4 Mon Sep 17 00:00:00 2001 From: chenwany Date: Sun, 6 Dec 2020 14:34:53 -0800 Subject: [PATCH 38/66] Revert "awsbatch: download AmazonLinux image from ECR rather than Docker Hub" --- CHANGELOG.md | 3 +- .../resources/batch/docker/buildspec.yml | 6 +- .../batch/docker/pull-alinux-image.sh | 22 ----- .../batch/docker/upload-docker-images.sh | 17 ++-- cloudformation/batch-substack.cfn.json | 82 ------------------- tests/integration-tests/clusters_factory.py | 13 --- .../tests/schedulers/test_awsbatch.py | 29 +------ 7 files changed, 15 insertions(+), 157 deletions(-) delete mode 100755 cli/src/pcluster/resources/batch/docker/pull-alinux-image.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index b075c614f7..cf150a89e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,8 +17,7 @@ CHANGELOG **CHANGES** -- Pull Amazon Linux Docker images from ECR when building docker image for `awsbatch` scheduler. This only applies to - images built for `x86` architecture. +- Pull Amazon Linux Docker images from ECR when building docker image for `awsbatch` scheduler. - Use inclusive language in user facing messages and internal naming convention. - Change the default of instance types from the hardcoded `t2.micro` to the free tier instance type (`t2.micro` or `t3.micro` dependent on region). In regions without free tier, the default is `t3.micro`. diff --git a/cli/src/pcluster/resources/batch/docker/buildspec.yml b/cli/src/pcluster/resources/batch/docker/buildspec.yml index 02c84ef843..2d9c5a6bb5 100644 --- a/cli/src/pcluster/resources/batch/docker/buildspec.yml +++ b/cli/src/pcluster/resources/batch/docker/buildspec.yml @@ -1,9 +1,13 @@ version: 0.2 phases: + install: + runtime-versions: + docker: 18 pre_build: commands: - - sh ./pull-alinux-image.sh + - echo Logging in to Amazon ECR... + - $(aws ecr get-login --no-include-email --region $AWS_DEFAULT_REGION) build: commands: - echo Build started on `date` diff --git a/cli/src/pcluster/resources/batch/docker/pull-alinux-image.sh b/cli/src/pcluster/resources/batch/docker/pull-alinux-image.sh deleted file mode 100755 index f07ba6a52f..0000000000 --- a/cli/src/pcluster/resources/batch/docker/pull-alinux-image.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -set -euxo pipefail - -pull_docker_image_from_ecr() { - echo "Pulling amazonlinux:2 image from ECR" - aws ecr get-login-password --region "${ALINUX_ECR_REGISTRY_REGION}" | docker login --username AWS --password-stdin "${ALINUX_ECR_REGISTRY}" || return 1 - docker pull "${ALINUX_ECR_REGISTRY}/amazonlinux:2" || return 1 - docker tag "${ALINUX_ECR_REGISTRY}/amazonlinux:2" amazonlinux:2 -} - -if [ "${IMAGE}" = "alinux" ] || [ "${IMAGE}" = "alinux2" ]; then - if [ "${ARCHITECTURE}" = "x86_64" ]; then - if pull_docker_image_from_ecr; then - echo "Successfully pulled Amazon Linux image from ECR" - else - echo "Failed when pulling amazonlinux:2 image from ECR. 
Falling back to Docker Hub" - docker pull amazonlinux:2 - fi - else - docker pull amazonlinux:2 - fi -fi diff --git a/cli/src/pcluster/resources/batch/docker/upload-docker-images.sh b/cli/src/pcluster/resources/batch/docker/upload-docker-images.sh index faddcbe468..1dd138b0d5 100755 --- a/cli/src/pcluster/resources/batch/docker/upload-docker-images.sh +++ b/cli/src/pcluster/resources/batch/docker/upload-docker-images.sh @@ -1,20 +1,17 @@ #!/usr/bin/env bash -set -euxo pipefail - -DOMAIN_SUFFIX="" -if [[ ${AWS_REGION} == cn-* ]]; then - DOMAIN_SUFFIX=".cn" -fi +set -eu push_docker_image() { local image=$1 echo "Uploading image ${image}" - docker tag "${IMAGE_REPO_NAME}:${image}" "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${DOMAIN_SUFFIX}/${IMAGE_REPO_NAME}:${image}" - docker push "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${DOMAIN_SUFFIX}/${IMAGE_REPO_NAME}:${image}" + S3_SUFFIX="" + if [[ ${AWS_REGION} == cn-* ]]; then + S3_SUFFIX=".cn" + fi + docker tag "${IMAGE_REPO_NAME}:${image}" "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${S3_SUFFIX}/${IMAGE_REPO_NAME}:${image}" + docker push "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${S3_SUFFIX}/${IMAGE_REPO_NAME}:${image}" } -aws ecr get-login-password --region "${AWS_REGION}" | docker login --username AWS --password-stdin "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com${DOMAIN_SUFFIX}" - if [ -z "${IMAGE}" ]; then for file in $(find `pwd` -type f -name Dockerfile); do IMAGE_TAG=$(dirname "${file}" | xargs basename) diff --git a/cloudformation/batch-substack.cfn.json b/cloudformation/batch-substack.cfn.json index 728a38549e..4be334dd4d 100644 --- a/cloudformation/batch-substack.cfn.json +++ b/cloudformation/batch-substack.cfn.json @@ -1,23 +1,4 @@ { - "Mappings": { - "AmazonLinuxECR": { - "aws": { - "registry": "137112412989.dkr.ecr.us-east-1.amazonaws.com", - "region": "us-east-1", - "account": "137112412989" - }, - "aws-us-gov": { - "registry": "045324592363.dkr.ecr.us-gov-east-1.amazonaws.com", - "region": "us-gov-east-1", - "account": "045324592363" - }, - "aws-cn": { - "registry": "141808717104.dkr.ecr.cn-north-1.amazonaws.com.cn", - "region": "cn-north-1", - "account": "141808717104" - } - } - }, "Parameters": { "MinvCpus": { "Description": "Min vCPU's for ComputeEnvironment", @@ -674,41 +655,11 @@ "Ref": "OS" } }, - { - "Name": "ARCHITECTURE", - "Value": { - "Ref": "Architecture" - } - }, { "Name": "NOTIFICATION_URL", "Value": { "Ref": "DockerBuildWaitHandle" } - }, - { - "Name": "ALINUX_ECR_REGISTRY", - "Value": { - "Fn::FindInMap": [ - "AmazonLinuxECR", - { - "Ref": "AWS::Partition" - }, - "registry" - ] - } - }, - { - "Name": "ALINUX_ECR_REGISTRY_REGION", - "Value": { - "Fn::FindInMap": [ - "AmazonLinuxECR", - { - "Ref": "AWS::Partition" - }, - "region" - ] - } } ], "Image": { @@ -803,39 +754,6 @@ "Fn::Sub": "arn:${AWS::Partition}:s3:::${ResourcesS3Bucket}/${ArtifactS3RootDirectory}/*" }, "Sid": "S3GetObjectPolicy" - }, - { - "Action": [ - "ecr:BatchGetImage", - "ecr:GetDownloadUrlForLayer" - ], - "Effect": "Allow", - "Resource": { - "Fn::Sub": [ - "arn:${AWS::Partition}:ecr:${alinux_ecr_region}:${alinux_ecr_registry_account}:repository/amazonlinux", - { - "alinux_ecr_region": { - "Fn::FindInMap": [ - "AmazonLinuxECR", - { - "Ref": "AWS::Partition" - }, - "region" - ] - }, - "alinux_ecr_registry_account": { - "Fn::FindInMap": [ - "AmazonLinuxECR", - { - "Ref": "AWS::Partition" - }, - "account" - ] - } - } - ] - }, - "Sid": "AlinuxECRRepoPolicy" } ], "Version": "2012-10-17" diff --git 
a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index 87585a4673..8643d43746 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -35,7 +35,6 @@ def __init__(self, name, ssh_key, config_file): self.__cfn_resources = None self.__head_node_substack_cfn_resources = None self.__ebs_substack_cfn_resources = None - self.__awsbatch_substack_cfn_resources = None def __repr__(self): attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) @@ -229,18 +228,6 @@ def ebs_substack_cfn_resources(self): ) return self.__ebs_substack_cfn_resources - @property - def awsbatch_substack_cfn_resources(self): - """ - Return the CloudFormation stack resources for the cluster's EBS substack. - Resources are retrieved only once and then cached. - """ - if not self.__awsbatch_substack_cfn_resources: - self.__awsbatch_substack_cfn_resources = retrieve_cfn_resources( - self.cfn_resources.get("AWSBatchStack"), self.region - ) - return self.__awsbatch_substack_cfn_resources - def _reset_cached_properties(self): """Discard cached data.""" self.__cfn_outputs = None diff --git a/tests/integration-tests/tests/schedulers/test_awsbatch.py b/tests/integration-tests/tests/schedulers/test_awsbatch.py index da2d6e5065..50edd288a1 100644 --- a/tests/integration-tests/tests/schedulers/test_awsbatch.py +++ b/tests/integration-tests/tests/schedulers/test_awsbatch.py @@ -9,10 +9,8 @@ # or in the "LICENSE.txt" file accompanying this file. # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. -import json import logging -import boto3 import pytest from assertpy import assert_that from remote_command_executor import RemoteCommandExecutor @@ -27,8 +25,8 @@ @pytest.mark.dimensions("cn-north-1", "c4.xlarge", "alinux2", "awsbatch") @pytest.mark.dimensions("ap-southeast-1", "c5.xlarge", "alinux", "awsbatch") @pytest.mark.dimensions("ap-northeast-1", "m6g.xlarge", "alinux2", "awsbatch") -@pytest.mark.usefixtures("instance", "scheduler") -def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir, caplog, region, os, architecture): +@pytest.mark.usefixtures("region", "os", "instance", "scheduler") +def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir, caplog, region): """ Test all AWS Batch related features. 
@@ -41,35 +39,12 @@ def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir, caplog remote_command_executor = RemoteCommandExecutor(cluster) timeout = 120 if region.startswith("cn-") else 60 # Longer timeout in china regions due to less reliable networking - _assert_successful_codebuild_build(cluster, region, os, architecture) _test_simple_job_submission(remote_command_executor, test_datadir, timeout) _test_array_submission(remote_command_executor) _test_mnp_submission(remote_command_executor, test_datadir) _test_job_kill(remote_command_executor, timeout) -def _assert_successful_codebuild_build(cluster, region, os, architecture): - logging.info("Verifying docker build completed successfully.") - codebuild_project = cluster.awsbatch_substack_cfn_resources.get("CodeBuildDockerImageBuilderProject") - codebuild_client = boto3.client("codebuild", region_name=region) - logs_client = boto3.client("logs", region_name=region) - - build_ids = codebuild_client.list_builds_for_project(projectName=codebuild_project).get("ids") - assert_that(build_ids).is_length(1) - build = codebuild_client.batch_get_builds(ids=build_ids).get("builds")[0] - assert_that(build["buildStatus"]).is_equal_to("SUCCEEDED") - - # check Amazon Linux image is pulled from ECR - if os.startswith("alinux") and architecture == "x86_64": - response = logs_client.get_log_events( - logGroupName=build["logs"]["groupName"], - logStreamName=build["logs"]["streamName"], - limit=100, - startFromHead=True, - ) - assert_that(json.dumps(response)).contains("Successfully pulled Amazon Linux image from ECR") - - def _test_simple_job_submission(remote_command_executor, test_datadir, timeout): logging.info("Testing inline submission.") _test_job_submission(remote_command_executor, f"awsbsub --vcpus 2 --memory 256 --timeout {timeout} sleep 1") From e5e1367ab99d2542afcb9035f09ef9110c7e12a8 Mon Sep 17 00:00:00 2001 From: chenwany Date: Mon, 7 Dec 2020 11:12:37 -0800 Subject: [PATCH 39/66] awsbatch: download AmazonLinux image from public ECR Pull all the AL docker images from public ECR, including architecure are x86 and ARM Signed-off-by: chenwany --- cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile | 2 +- cli/src/pcluster/resources/batch/docker/buildspec.yml | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile b/cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile index 97b6642f47..737e94dcae 100644 --- a/cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile +++ b/cli/src/pcluster/resources/batch/docker/alinux2/Dockerfile @@ -1,4 +1,4 @@ -FROM amazonlinux:2 +FROM public.ecr.aws/amazonlinux/amazonlinux:2 ENV USER root diff --git a/cli/src/pcluster/resources/batch/docker/buildspec.yml b/cli/src/pcluster/resources/batch/docker/buildspec.yml index 2d9c5a6bb5..0f14e5f0fa 100644 --- a/cli/src/pcluster/resources/batch/docker/buildspec.yml +++ b/cli/src/pcluster/resources/batch/docker/buildspec.yml @@ -1,9 +1,6 @@ version: 0.2 phases: - install: - runtime-versions: - docker: 18 pre_build: commands: - echo Logging in to Amazon ECR... 
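For context on the two Docker-related patches above: the Amazon Linux base image is now pulled anonymously from the public ECR gallery, while the private per-cluster image repository URI is still assembled per partition by upload-docker-images.sh. A minimal Python sketch of that URI logic follows; the helper name and example values are illustrative, not taken from the repository.

    # Mirrors the suffix handling in upload-docker-images.sh: China regions use the
    # .amazonaws.com.cn ECR domain, every other region uses .amazonaws.com.
    def private_ecr_image_uri(account_id, region, repo_name, tag):
        domain_suffix = ".cn" if region.startswith("cn-") else ""
        return "{0}.dkr.ecr.{1}.amazonaws.com{2}/{3}:{4}".format(account_id, region, domain_suffix, repo_name, tag)

    # Amazon Linux base image after this patch, pulled without ECR credentials:
    PUBLIC_ALINUX_IMAGE = "public.ecr.aws/amazonlinux/amazonlinux:2"

    assert (
        private_ecr_image_uri("123456789012", "cn-north-1", "parallelcluster-example", "alinux2")
        == "123456789012.dkr.ecr.cn-north-1.amazonaws.com.cn/parallelcluster-example:alinux2"
    )
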
From 4b016ad5e49aa2228f0475884d30464168a50195 Mon Sep 17 00:00:00 2001 From: chenwany Date: Wed, 9 Dec 2020 11:05:38 -0800 Subject: [PATCH 40/66] Fix integration test kms key: add region name Add region_name to boto3 call to get account id Signed-off-by: chenwany --- tests/integration-tests/tests/storage/kms_key_factory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/storage/kms_key_factory.py b/tests/integration-tests/tests/storage/kms_key_factory.py index 7c547d967e..bc01ee5f48 100644 --- a/tests/integration-tests/tests/storage/kms_key_factory.py +++ b/tests/integration-tests/tests/storage/kms_key_factory.py @@ -30,7 +30,9 @@ def create_kms_key(self, region): """ self.region = region self.account_id = ( - boto3.client("sts", endpoint_url=_get_sts_endpoint(region)).get_caller_identity().get("Account") + boto3.client("sts", endpoint_url=_get_sts_endpoint(region), region_name=region) + .get_caller_identity() + .get("Account") ) if self.kms_key_id: From b2bca776be9dc72a79d771420380a7c411f960b6 Mon Sep 17 00:00:00 2001 From: chenwany Date: Mon, 14 Dec 2020 12:16:23 -0800 Subject: [PATCH 41/66] Fix integ_test ebs encrypted with KMS: add policy for chronicle and shorter role name Difine shorter role name and policy name to fix the problem that in some region, the length of role name created in the KMS key exceed its max lenghth 64. Add permission to instance policy in the test to enable download Chronicle from Jenkins bucket. Signed-off-by: chenwany --- .../resources/batch_instance_policy.json | 10 ++++++++++ .../resources/traditional_instance_policy.json | 10 ++++++++++ .../integration-tests/tests/storage/kms_key_factory.py | 8 ++++---- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/integration-tests/resources/batch_instance_policy.json b/tests/integration-tests/resources/batch_instance_policy.json index 857f043594..9cc41a740b 100644 --- a/tests/integration-tests/resources/batch_instance_policy.json +++ b/tests/integration-tests/resources/batch_instance_policy.json @@ -112,6 +112,16 @@ ], "Effect": "Allow", "Sid": "CWLogs" + }, + { + "Action": [ + "s3:GetObject" + ], + "Resource": [ + "arn:{{ partition }}:s3:::aws-parallelcluster-jenkins-*" + ], + "Effect": "Allow", + "Sid": "Chronicle" } ] } \ No newline at end of file diff --git a/tests/integration-tests/resources/traditional_instance_policy.json b/tests/integration-tests/resources/traditional_instance_policy.json index b9c826f56d..710a918fa4 100644 --- a/tests/integration-tests/resources/traditional_instance_policy.json +++ b/tests/integration-tests/resources/traditional_instance_policy.json @@ -168,6 +168,16 @@ ], "Effect": "Allow", "Sid": "Route53" + }, + { + "Action": [ + "s3:GetObject" + ], + "Resource": [ + "arn:{{ partition }}:s3:::aws-parallelcluster-jenkins-*" + ], + "Effect": "Allow", + "Sid": "Chronicle" } ] } \ No newline at end of file diff --git a/tests/integration-tests/tests/storage/kms_key_factory.py b/tests/integration-tests/tests/storage/kms_key_factory.py index bc01ee5f48..7d81752a49 100644 --- a/tests/integration-tests/tests/storage/kms_key_factory.py +++ b/tests/integration-tests/tests/storage/kms_key_factory.py @@ -48,11 +48,11 @@ def _create_role(self, region): :param region: Create different roles on different regions, since we need to attach different policies """ random_string = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8)) - iam_role_name = 
"Integration_test_ParallelClusterInstanceRole_{0}_{1}".format(self.region, random_string) + iam_role_name = "Integ_test_InstanceRole_{0}_{1}".format(self.region, random_string) - iam_policy_name_batch = "".join("Integ_test_ParallelClusterInstancePolicy_batch" + random_string) + iam_policy_name_batch = "".join("Integ_test_InstancePolicy_batch" + random_string) logging.info("iam policy for awsbatch is {0}".format(iam_policy_name_batch)) - iam_policy_name_traditional = "".join("Integ_test_ParallelClusterInstancePolicy" + random_string) + iam_policy_name_traditional = "".join("Integ_test_InstancePolicy" + random_string) logging.info("iam_policy for traditional scheduler is {0}".format(iam_policy_name_traditional)) self.iam_client = boto3.client("iam", region_name=region) @@ -130,7 +130,7 @@ def _create_kms_key(self, region): # create KMS key self.kms_client = boto3.client("kms", region_name=region) random_string = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8)) - key_alias = "alias/Integration_test_KMS_key_{0}_{1}".format(self.region, random_string) + key_alias = "alias/Integ_test_KMS_{0}_{1}".format(self.region, random_string) # If the key already existed, use the existing key for alias in self.kms_client.list_aliases().get("Aliases"): From 0695bc2e4b53f7d63e4217f3c1c73a5ae005f81b Mon Sep 17 00:00:00 2001 From: Tim Lane Date: Mon, 30 Nov 2020 16:51:44 -0800 Subject: [PATCH 42/66] Enable rollback data generation for arbitrary S3 files Previously generating rollback data was only supported by the modified code for S3 keys related to the config files used to allow list certain features/instance types. This enables the same code to be used for generating rollback data for other S3 files. The motivation for this is to enable the code to be used to generate rollback data for dependencies to be hosted in S3. 
Signed-off-by: Tim Lane --- util/common.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/util/common.py b/util/common.py index 87e727c79a..b2bab38fe5 100644 --- a/util/common.py +++ b/util/common.py @@ -73,10 +73,13 @@ def generate_rollback_data(regions, dest_bucket, files, sts_credentials): rollback_data[bucket_name] = {"region": region, "files": {}} doc_manager = S3DocumentManager(region, sts_credentials.get(region)) for file_type in files: + s3_path = FILE_TO_S3_PATH.get(file_type, file_type) version = doc_manager.get_current_version( - dest_bucket.format(region=region), FILE_TO_S3_PATH[file_type], raise_on_object_not_found=False + dest_bucket.format(region=region), + s3_path, + raise_on_object_not_found=False, ) - rollback_data[bucket_name]["files"][FILE_TO_S3_PATH[file_type]] = version + rollback_data[bucket_name]["files"][s3_path] = version logging.info("Rollback data:\n%s", json.dumps(rollback_data, indent=2)) rollback_file_name = "rollback-data.json" From 741b75bd24caa765ab33f00aac289dcbac9da32a Mon Sep 17 00:00:00 2001 From: Yulei Wang Date: Thu, 5 Nov 2020 14:44:51 -0800 Subject: [PATCH 43/66] integ-tests: test arm performance library Signed-off-by: Yulei Wang --- .../configs/common/common.yaml | 7 ++ .../tests/arm_pl/test_arm_pl.py | 70 +++++++++++++++++++ .../test_arm_pl/pcluster.config.ini | 20 ++++++ 3 files changed, 97 insertions(+) create mode 100644 tests/integration-tests/tests/arm_pl/test_arm_pl.py create mode 100644 tests/integration-tests/tests/arm_pl/test_arm_pl/test_arm_pl/pcluster.config.ini diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index 4b4d51de5d..57af542f04 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -1,3 +1,10 @@ +arm_pl: + test_arm_pl.py::test_arm_pl: + dimensions: + - regions: ["ap-northeast-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: ["alinux2", "centos8", "ubuntu1804"] + schedulers: ["slurm"] cfn-init: test_cfn_init.py::test_replace_compute_on_failure: dimensions: diff --git a/tests/integration-tests/tests/arm_pl/test_arm_pl.py b/tests/integration-tests/tests/arm_pl/test_arm_pl.py new file mode 100644 index 0000000000..9208c4448c --- /dev/null +++ b/tests/integration-tests/tests/arm_pl/test_arm_pl.py @@ -0,0 +1,70 @@ +# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+import logging + +import pytest +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor + + +@pytest.mark.regions(["ap-northeast-1"]) +@pytest.mark.instances(["m6g.xlarge"]) +@pytest.mark.oss(["ubuntu1804", "alinux2", "centos8"]) +@pytest.mark.schedulers(["slurm"]) +def test_arm_pl(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir): + """Test Arm Performance Library""" + cluster_config = pcluster_config_reader() + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + # arm performance library version and gcc version + armpl_version = "20.2.1" + gcc_version = "9.3" + + # loading module armpl/{armpl_version} will load module armpl/gcc-{gcc_version} + # and armpl/{armpl_version}_gcc-{gcc_vesion} sequentially + armpl_module_general_name = f"armpl/{armpl_version}" + armpl_module_name = f"armpl/{armpl_version}_gcc-{gcc_version}" + gcc_module_name = f"armpl/gcc-{gcc_version}" + _test_armpl_examples( + remote_command_executor, + armpl_module_general_name, + armpl_module_name, + gcc_module_name, + armpl_version, + gcc_version, + ) + + +def _test_armpl_examples( + remote_command_executor, armpl_module_general_name, armpl_module_name, gcc_module_name, armpl_version, gcc_version +): + # Test arm performance library examples to check arm performance library is available in cluster + logging.info("Test arm performance library examples") + + # Load armpl module and gcc-9.3 module and assert module loaded + module_result = remote_command_executor.run_remote_command( + f"module load {armpl_module_general_name} && module list" + ).stdout + for module in [armpl_module_general_name, armpl_module_name, gcc_module_name]: + assert_that(module_result).contains(module) + + # Assert pass the example tests + remote_command_executor.run_remote_command( + f"sudo chmod 777 /opt/arm/armpl/{armpl_version}/armpl_{armpl_version}_gcc-{gcc_version}/examples" + ) + test_result = remote_command_executor.run_remote_command( + f"module load {armpl_module_general_name} && " + f"cd /opt/arm/armpl/{armpl_version}/armpl_{armpl_version}_gcc-{gcc_version}/examples && make clean && make" + ).stdout.lower() + assert_that(test_result).contains("testing: no example difference files were generated") + assert_that(test_result).contains("test passed ok") diff --git a/tests/integration-tests/tests/arm_pl/test_arm_pl/test_arm_pl/pcluster.config.ini b/tests/integration-tests/tests/arm_pl/test_arm_pl/test_arm_pl/pcluster.config.ini new file mode 100644 index 0000000000..ce19644041 --- /dev/null +++ b/tests/integration-tests/tests/arm_pl/test_arm_pl/test_arm_pl/pcluster.config.ini @@ -0,0 +1,20 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} + + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +use_public_ips = false \ No newline at end of file From f60491dbd98b4fecc5c2792b334cb4a2a2db3971 Mon Sep 17 00:00:00 2001 From: ddeidda Date: Mon, 7 Dec 2020 11:47:22 +0100 Subject: [PATCH 44/66] Enable EFA on all supported OSs but Centos8 The new EFA installer provides the EFA kmod for all supported OSs except for Centos8. 
This commit adds a validator to prevent EFA from being enabled on ARM architectures with Centos8. Signed-off-by: ddeidda --- CHANGELOG.md | 9 ++++- cli/src/pcluster/config/mappings.py | 4 ++- cli/src/pcluster/config/validators.py | 19 +++++++++++ cli/tests/pcluster/config/test_validators.py | 36 ++++++++++++++++++++ 4 files changed, 66 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf150a89e0..be7cf1e139 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ CHANGELOG - Remove CloudFormation DescribeStacks API call from AWS Batch Docker entrypoint. This removes the possibility of job failures due to CloudFormation throttling. - Add support for io2 EBS volume type. +- Install EFA kernel module also on ARM instances with `alinux2` and `ubuntu1804` **CHANGES** @@ -21,7 +22,13 @@ CHANGELOG - Use inclusive language in user facing messages and internal naming convention. - Change the default of instance types from the hardcoded `t2.micro` to the free tier instance type (`t2.micro` or `t3.micro` dependent on region). In regions without free tier, the default is `t3.micro`. - +- Upgrade EFA installer to version 1.11.0 + - EFA configuration: ``efa-config-1.6`` (from efa-config-1.5) + - EFA profile: ``efa-profile-1.2`` (from efa-profile-1.1) + - EFA kernel module: ``efa-1.10.2`` (no change) + - RDMA core: ``rdma-core-31.2amzn`` (from rdma-core-31.amzn0) + - Libfabric: ``libfabric-1.11.1amzn1.0`` (from libfabric-1.11.1amzn1.1) + - Open MPI: ``openmpi40-aws-4.0.5`` (no change) **BUG FIXES** diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py index 7479626bc6..cfd694fb69 100644 --- a/cli/src/pcluster/config/mappings.py +++ b/cli/src/pcluster/config/mappings.py @@ -77,6 +77,7 @@ ec2_volume_validator, ec2_vpc_id_validator, efa_gdr_validator, + efa_os_arch_validator, efa_validator, efs_id_validator, efs_validator, @@ -718,6 +719,7 @@ }), ("enable_efa", { "type": BooleanJsonParam, + "validators": [efa_os_arch_validator], "update_policy": UpdatePolicy.COMPUTE_FLEET_STOP, }), ("enable_efa_gdr", { @@ -833,7 +835,7 @@ ("enable_efa", { "allowed_values": ["compute"], "cfn_param_mapping": "EFA", - "validators": [efa_validator], + "validators": [efa_validator, efa_os_arch_validator], "update_policy": UpdatePolicy.UNSUPPORTED }), ("enable_efa_gdr", { diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index 20f17f277e..9ff28b9658 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -68,6 +68,11 @@ FSX_PARAM_WITH_DEFAULT = {"drive_cache_type": "NONE"} +EFA_UNSUPPORTED_ARCHITECTURES_OSES = { + "x86_64": [], + "arm64": ["centos8"], +} + EBS_VOLUME_TYPE_TO_VOLUME_SIZE_BOUNDS = { "standard": (1, 1024), "io1": (4, 16 * 1024), @@ -1539,3 +1544,17 @@ def duplicate_shared_dir_validator(section_key, section_label, pcluster_config): errors.append("'shared_dir' can not be specified in cluster section when using multiple EBS volumes") return errors, warnings + + +def efa_os_arch_validator(param_key, param_value, pcluster_config): + errors = [] + warnings = [] + + cluster_section = pcluster_config.get_section("cluster") + architecture = cluster_section.get_param_value("architecture") + base_os = cluster_section.get_param_value("base_os") + + if base_os in EFA_UNSUPPORTED_ARCHITECTURES_OSES.get(architecture): + errors.append("EFA currently not supported on {0} for {1} architecture".format(base_os, architecture)) + + return errors, warnings diff --git 
a/cli/tests/pcluster/config/test_validators.py b/cli/tests/pcluster/config/test_validators.py index e603216d50..f291a9302b 100644 --- a/cli/tests/pcluster/config/test_validators.py +++ b/cli/tests/pcluster/config/test_validators.py @@ -29,6 +29,7 @@ compute_resource_validator, disable_hyperthreading_architecture_validator, efa_gdr_validator, + efa_os_arch_validator, fsx_ignored_parameters_validator, instances_architecture_compatibility_validator, intel_hpc_architecture_validator, @@ -2677,3 +2678,38 @@ def test_duplicate_shared_dir_validator( def test_extra_json_validator(mocker, capsys, extra_json, expected_message): config_parser_dict = {"cluster default": extra_json} utils.assert_param_validator(mocker, config_parser_dict, capsys=capsys, expected_warning=expected_message) + + +@pytest.mark.parametrize( + "cluster_dict, architecture, expected_error", + [ + ({"base_os": "alinux2", "enable_efa": "compute"}, "x86_64", None), + ({"base_os": "alinux2", "enable_efa": "compute"}, "arm64", None), + ({"base_os": "centos8", "enable_efa": "compute"}, "x86_64", None), + ( + {"base_os": "centos8", "enable_efa": "compute"}, + "arm64", + "EFA currently not supported on centos8 for arm64 architecture", + ), + ({"base_os": "ubuntu1804", "enable_efa": "compute"}, "x86_64", None), + ({"base_os": "ubuntu1804", "enable_efa": "compute"}, "arm64", None), + ], +) +def test_efa_os_arch_validator(mocker, cluster_dict, architecture, expected_error): + mocker.patch( + "pcluster.config.cfn_param_types.BaseOSCfnParam.get_instance_type_architecture", return_value=architecture + ) + + config_parser_dict = {"cluster default": cluster_dict} + config_parser = configparser.ConfigParser() + config_parser.read_dict(config_parser_dict) + + pcluster_config = utils.init_pcluster_config_from_configparser(config_parser, False, auto_refresh=False) + pcluster_config.get_section("cluster").get_param("architecture").value = architecture + enable_efa_value = pcluster_config.get_section("cluster").get_param_value("enable_efa") + + errors, warnings = efa_os_arch_validator("enable_efa", enable_efa_value, pcluster_config) + if expected_error: + assert_that(errors[0]).matches(expected_error) + else: + assert_that(errors).is_empty() From da8ef65ec14718dee72eabfde09924acbe3f6e2b Mon Sep 17 00:00:00 2001 From: Hanwen <68928867+hanwen-pcluste@users.noreply.github.com> Date: Tue, 15 Dec 2020 14:29:43 -0500 Subject: [PATCH 45/66] Remove the ban of using p4d as head node (#2308) * Remove the ban of using p4d as head node Signed-off-by: Hanwen * Update CHANGELOG.md Co-authored-by: Francesco De Martino --- CHANGELOG.md | 1 + cli/src/pcluster/config/validators.py | 2 +- cli/tests/pcluster/config/test_validators.py | 4 +--- cli/tests/pcluster/configure/test_pcluster_configure.py | 9 --------- .../test_multiple_nics/pcluster.config.ini | 2 +- 5 files changed, 4 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be7cf1e139..a359206679 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ CHANGELOG - Use inclusive language in user facing messages and internal naming convention. - Change the default of instance types from the hardcoded `t2.micro` to the free tier instance type (`t2.micro` or `t3.micro` dependent on region). In regions without free tier, the default is `t3.micro`. +- Enable support for p4d as head node instance type. 
(p4d was already supported as compute node in 2.10.0) - Upgrade EFA installer to version 1.11.0 - EFA configuration: ``efa-config-1.6`` (from efa-config-1.5) - EFA profile: ``efa-profile-1.2`` (from efa-profile-1.1) diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index 9ff28b9658..8dda0800a4 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -82,7 +82,7 @@ "sc1": (500, 16 * 1024), } -HEAD_NODE_UNSUPPORTED_INSTANCE_TYPES = ["p4d.24xlarge"] +HEAD_NODE_UNSUPPORTED_INSTANCE_TYPES = [] HEAD_NODE_UNSUPPORTED_MESSAGE = "The instance type '{0}' is not supported as head node." # Constants for section labels diff --git a/cli/tests/pcluster/config/test_validators.py b/cli/tests/pcluster/config/test_validators.py index f291a9302b..ff3736e47f 100644 --- a/cli/tests/pcluster/config/test_validators.py +++ b/cli/tests/pcluster/config/test_validators.py @@ -88,9 +88,7 @@ def test_ec2_instance_type_validator(mocker, instance_type, expected_message): utils.assert_param_validator(mocker, config_parser_dict, expected_message) -@pytest.mark.parametrize( - "instance_type, expected_message", [("t2.micro", None), ("c4.xlarge", None), ("p4d.24xlarge", "is not supported")] -) +@pytest.mark.parametrize("instance_type, expected_message", [("t2.micro", None), ("c4.xlarge", None)]) def test_head_node_instance_type_validator(mocker, instance_type, expected_message): config_parser_dict = {"cluster default": {"master_instance_type": instance_type}} utils.assert_param_validator(mocker, config_parser_dict, expected_message) diff --git a/cli/tests/pcluster/configure/test_pcluster_configure.py b/cli/tests/pcluster/configure/test_pcluster_configure.py index cb4b43b12c..63a9359ade 100644 --- a/cli/tests/pcluster/configure/test_pcluster_configure.py +++ b/cli/tests/pcluster/configure/test_pcluster_configure.py @@ -914,12 +914,3 @@ def test_hit_config_file(mocker, capsys, test_datadir): # Expected sys exit with error with pytest.raises(SystemExit, match="ERROR: Configuration in file .* cannot be overwritten"): _run_configuration(mocker, old_config_file, with_config=True) - - -def test_invalid_p4d_head_node_type(mocker): - with pytest.raises(StopIteration): - assert_that(general_wrapper_for_prompt_testing(mocker, head_node_instance="p4d.24xlarge")).is_true() - - -def test_valid_p4d_compute_node_type(mocker): - assert_that(general_wrapper_for_prompt_testing(mocker, compute_instance="p4d.24xlarge")).is_true() diff --git a/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.ini b/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.ini index 71e62ed1cf..ac6fc00483 100644 --- a/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.ini +++ b/tests/integration-tests/tests/multiple_nics/test_multiple_nics/test_multiple_nics/pcluster.config.ini @@ -8,7 +8,7 @@ cluster_template = default base_os = {{ os }} key_name = {{ key_name }} scheduler = {{ scheduler }} -master_instance_type = c5.xlarge +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} initial_queue_size = 1 maintain_initial_size = true From d0e8fca64befeec8406ac79f3eeb2ff704cda5bb Mon Sep 17 00:00:00 2001 From: Rex Date: Mon, 14 Dec 2020 14:06:58 -0800 Subject: [PATCH 46/66] Integration test: add test for scaling logic when clustermgtd is down * Modify hit_scaling tests to test logic when clustermgtd is down * Computemgtd should 
terminate any instance in DOWN or POWER_SAVE state, or if slurmctld is down * ResumeProgram should not launch any instance if clustermgtd is down Signed-off-by: Rex --- .../tests/common/schedulers_common.py | 2 +- .../tests/scaling/test_scaling.py | 50 ++++++++++++++++--- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index 974a433776..620665aeff 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -443,7 +443,7 @@ def cancel_job(self, job_id): return self._remote_command_executor.run_remote_command("scancel {}".format(job_id)) def set_nodes_state(self, compute_nodes, state): - """Put nodes into down state.""" + """Put nodes into a state.""" self._remote_command_executor.run_remote_command( "sudo /opt/slurm/bin/scontrol update NodeName={} state={} reason=testing".format( ",".join(compute_nodes), state diff --git a/tests/integration-tests/tests/scaling/test_scaling.py b/tests/integration-tests/tests/scaling/test_scaling.py index 4e3b8038b8..7298f2d3a9 100644 --- a/tests/integration-tests/tests/scaling/test_scaling.py +++ b/tests/integration-tests/tests/scaling/test_scaling.py @@ -23,6 +23,7 @@ from utils import get_compute_nodes_instance_ids, get_instance_ids_compute_hostnames_conversion_dict from tests.common.assertions import ( + assert_errors_in_logs, assert_instance_replaced_or_terminating, assert_no_errors_in_logs, assert_num_instances_constant, @@ -175,7 +176,9 @@ def test_hit_scaling(scheduler, region, instance, pcluster_config_reader, cluste num_dynamic_nodes=3, dynamic_instance_type=instance, ) - _test_computemgtd_logic( + # Next test will introduce error in logs, assert no error now + assert_no_errors_in_logs(remote_command_executor, scheduler) + _test_clustermgtd_down_logic( remote_command_executor, scheduler_commands, cluster.cfn_name, @@ -187,8 +190,6 @@ def test_hit_scaling(scheduler, region, instance, pcluster_config_reader, cluste dynamic_instance_type=instance, ) - assert_no_errors_in_logs(remote_command_executor, scheduler) - def _assert_cluster_initial_conditions(scheduler_commands, instance): """Assert that expected nodes are in cluster.""" @@ -346,7 +347,7 @@ def _test_keep_or_replace_suspended_nodes( assert_num_instances_in_cluster(cluster_name, region, len(static_nodes)) -def _test_computemgtd_logic( +def _test_clustermgtd_down_logic( remote_command_executor, scheduler_commands, cluster_name, @@ -358,7 +359,7 @@ def _test_computemgtd_logic( dynamic_instance_type, ): """Test that computemgtd is able to shut nodes down when clustermgtd and slurmctld are offline.""" - logging.info("Testing that nodes are shut down when clustermgtd and slurmctld are offline") + logging.info("Testing cluster protection logic when clustermgtd is down.") submit_initial_job( scheduler_commands, "sleep infinity", @@ -367,8 +368,10 @@ def _test_computemgtd_logic( num_dynamic_nodes, other_options="--no-requeue", ) - assert_initial_conditions(scheduler_commands, num_static_nodes, num_dynamic_nodes, partition) - logging.info("Killing clustermgtd and rewriting timestamp file") + static_nodes, dynamic_nodes = assert_initial_conditions( + scheduler_commands, num_static_nodes, num_dynamic_nodes, partition + ) + logging.info("Killing clustermgtd and rewriting timestamp file to trigger timeout.") remote_command_executor.run_remote_script(str(test_datadir / "slurm_kill_clustermgtd.sh"), 
run_as_root=True) # Overwrite clusterctld heartbeat to trigger timeout path timestamp_format = "%Y-%m-%d %H:%M:%S.%f%z" @@ -376,13 +379,37 @@ def _test_computemgtd_logic( remote_command_executor.run_remote_command( f"echo -n '{overwrite_time_str}' | sudo tee /opt/slurm/etc/pcluster/.slurm_plugin/clustermgtd_heartbeat" ) + # Test that computemgtd will terminate compute nodes that are down or in power_save + # Put first static node and first dynamic node into DOWN + # Put rest of dynamic nodes into POWER_DOWN + logging.info("Asserting that computemgtd will terminate nodes in DOWN or POWER_SAVE") + _set_nodes_to_down_manually(scheduler_commands, static_nodes[:1] + dynamic_nodes[:1]) + _set_nodes_to_power_down_manually(scheduler_commands, dynamic_nodes[1:]) + wait_for_num_instances_in_cluster(cluster_name, region, num_static_nodes - 1) + + logging.info("Testing that ResumeProgram launches no instance when clustermgtd is down") + submit_initial_job( + scheduler_commands, + "sleep infinity", + partition, + dynamic_instance_type, + num_dynamic_nodes, + ) + logging.info("Asserting that computemgtd is not self-terminating when slurmctld is up") - assert_num_instances_constant(cluster_name, region, desired=num_static_nodes + num_dynamic_nodes, timeout=2) + assert_num_instances_constant(cluster_name, region, desired=num_static_nodes - 1, timeout=2) + logging.info("Killing slurmctld") remote_command_executor.run_remote_script(str(test_datadir / "slurm_kill_slurmctld.sh"), run_as_root=True) logging.info("Waiting for computemgtd to self-terminate all instances") wait_for_num_instances_in_cluster(cluster_name, region, 0) + assert_errors_in_logs( + remote_command_executor, + ["/var/log/parallelcluster/slurm_resume.log"], + ["No valid clustermgtd heartbeat detected"], + ) + @retry(wait_fixed=seconds(30), stop_max_delay=minutes(15)) def _assert_failing_nodes_terminated(nodes_to_remove, hostname_to_instance_id, region): @@ -447,6 +474,13 @@ def _set_nodes_to_down_manually(scheduler_commands, compute_nodes): _assert_compute_node_states(scheduler_commands, compute_nodes, expected_states=["down"]) +def _set_nodes_to_power_down_manually(scheduler_commands, compute_nodes): + scheduler_commands.set_nodes_state(compute_nodes, state="power_down") + time.sleep(5) + scheduler_commands.set_nodes_state(compute_nodes, state="resume") + _assert_compute_node_states(scheduler_commands, compute_nodes, expected_states=["idle~"]) + + def _assert_compute_node_states(scheduler_commands, compute_nodes, expected_states): node_states = scheduler_commands.get_nodes_status(compute_nodes) for node in compute_nodes: From 95e38eae62c1b363cf5cc36ca1328ac006b619cf Mon Sep 17 00:00:00 2001 From: Yulei <68350383+yuleiwan@users.noreply.github.com> Date: Tue, 15 Dec 2020 16:36:23 -0800 Subject: [PATCH 47/66] integ-tests: add centos8 to dcv tests (#2310) Signed-off-by: Yulei Wang --- tests/integration-tests/configs/common/common.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index 57af542f04..c0139d0136 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -137,12 +137,12 @@ dcv: # DCV on GPU enabled instance - regions: ["eu-west-1"] instances: ["g3.8xlarge"] - oss: ["alinux2", "centos7", "ubuntu1804"] + oss: ["alinux2", "centos7", "centos8", "ubuntu1804"] schedulers: ["slurm"] # DCV on ARM - regions: ["eu-west-1"] instances: {{ 
common.INSTANCES_DEFAULT_ARM }} - oss: ["alinux2", "ubuntu1804"] + oss: ["alinux2", "centos8", "ubuntu1804"] schedulers: ["slurm"] # DCV in cn regions and non GPU enabled instance - regions: ["cn-northwest-1"] From 3621fb8137809965c84f9c5367ecc2539b52b597 Mon Sep 17 00:00:00 2001 From: Hanwen <68928867+hanwen-pcluste@users.noreply.github.com> Date: Tue, 15 Dec 2020 19:46:44 -0500 Subject: [PATCH 48/66] Add iam_lambda_role parameter under cluster section in the config file (#2304) 1. Add `iam_lambda_role` parameter to the config file. If specified, this role will be attached to all Lambda function resources created by CloudFormation Templates. 2. If both `ec2_iam_role` and `iam_lambda_role` are provided, and the scheduler is `sge`, `torque`, or `slurm`, there will be no IAM role created by `pcluster` commands. Note that if `awsbatch` is the scheduler, there will be a role created during `pcluster create`. 3. Integration tests: Extract some functions (role creation, policy creation) from `storage.kms_key_factory` to `conftest`. The code in `kms_key_factory` is kept untouched to limit the scale of this commit. Signed-off-by: Hanwen --- CHANGELOG.md | 4 + cli/src/pcluster/config/mappings.py | 4 + cli/src/pcluster/examples/config | 3 + cli/tests/pcluster/config/defaults.py | 8 +- cloudformation/aws-parallelcluster.cfn.json | 36 ++++- cloudformation/batch-substack.cfn.json | 46 ++++-- .../compute-fleet-hit-substack.cfn.yaml | 17 ++- .../configs/common/common.yaml | 10 +- .../integration-tests/configs/new_region.yaml | 4 +- tests/integration-tests/conftest.py | 111 ++++++++++++++- .../batch_lambda_function_policy.json | 44 ++++++ .../traditional_lambda_function_policy.json | 64 +++++++++ tests/integration-tests/tests/common/utils.py | 5 + tests/integration-tests/tests/iam/test_iam.py | 133 ++++++++++++++++++ .../test_iam_policies/pcluster.config.ini | 0 .../tests/iam/test_iam/test_iam_roles/HIT.ini | 29 ++++ .../tests/iam/test_iam/test_iam_roles/SIT.ini | 28 ++++ .../tests/iam_policies/test_iam_policies.py | 54 ------- 18 files changed, 525 insertions(+), 75 deletions(-) create mode 100644 tests/integration-tests/resources/batch_lambda_function_policy.json create mode 100644 tests/integration-tests/resources/traditional_lambda_function_policy.json create mode 100644 tests/integration-tests/tests/iam/test_iam.py rename tests/integration-tests/tests/{iam_policies/test_iam_policies => iam/test_iam}/test_iam_policies/pcluster.config.ini (100%) create mode 100644 tests/integration-tests/tests/iam/test_iam/test_iam_roles/HIT.ini create mode 100644 tests/integration-tests/tests/iam/test_iam/test_iam_roles/SIT.ini delete mode 100644 tests/integration-tests/tests/iam_policies/test_iam_policies.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a359206679..35a0769fb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ CHANGELOG failures due to CloudFormation throttling. - Add support for io2 EBS volume type. - Install EFA kernel module also on ARM instances with `alinux2` and `ubuntu1804` +- Add `iam_lambda_role` parameter under `cluster` section to enable the possibility to specify an existing IAM role to + be used by AWS Lambda functions in CloudFormation. + When using `sge`, `torque`, or `slurm` as the scheduler, + `pcluster` will not create any IAM role if both `ec2_iam_role` and `iam_lambda_role` are provided.
**CHANGES** diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py index cfd694fb69..36e4575a54 100644 --- a/cli/src/pcluster/config/mappings.py +++ b/cli/src/pcluster/config/mappings.py @@ -1007,6 +1007,10 @@ "validators": [s3_bucket_validator], "update_policy": UpdatePolicy.READ_ONLY_RESOURCE_BUCKET, }), + ("iam_lambda_role", { + "cfn_param_mapping": "IAMLambdaRoleName", + "update_policy": UpdatePolicy.SUPPORTED, + }), ] diff --git a/cli/src/pcluster/examples/config b/cli/src/pcluster/examples/config index f3494474ba..914766f190 100644 --- a/cli/src/pcluster/examples/config +++ b/cli/src/pcluster/examples/config @@ -109,6 +109,9 @@ key_name = mykey # Existing EC2 IAM policies to be associated with the EC2 instances # (defaults to NONE) #additional_iam_policies = NONE +# Existing IAM role to be associated with Lambda functions +# (defaults to NONE) +#iam_lambda_role = NONE # Disable Hyperthreading on all instances # (defaults to False) #disable_hyperthreading = false diff --git a/cli/tests/pcluster/config/defaults.py b/cli/tests/pcluster/config/defaults.py index 73d0e4cae4..fec837315a 100644 --- a/cli/tests/pcluster/config/defaults.py +++ b/cli/tests/pcluster/config/defaults.py @@ -145,6 +145,7 @@ "architecture": "x86_64", "network_interfaces_count": ["1", "1"], "cluster_resource_bucket": None, + "iam_lambda_role": None, } DEFAULT_CLUSTER_HIT_DICT = { @@ -194,6 +195,7 @@ "architecture": "x86_64", "network_interfaces_count": ["1", "1"], "cluster_resource_bucket": None, # cluster_resource_bucket no default, but this is here to make testing easier + "iam_lambda_role": None, } DEFAULT_CW_LOG_DICT = {"enable": True, "retention_days": 14} @@ -226,8 +228,8 @@ class DefaultDict(Enum): # ------------------ Default CFN parameters ------------------ # # number of CFN parameters created by the PclusterConfig object. 
-CFN_SIT_CONFIG_NUM_OF_PARAMS = 61 -CFN_HIT_CONFIG_NUM_OF_PARAMS = 52 +CFN_SIT_CONFIG_NUM_OF_PARAMS = 62 +CFN_HIT_CONFIG_NUM_OF_PARAMS = 53 # CFN parameters created by the pcluster CLI CFN_CLI_RESERVED_PARAMS = ["ArtifactS3RootDirectory", "RemoveBucketOnDeletion"] @@ -343,6 +345,7 @@ class DefaultDict(Enum): # architecture "Architecture": "x86_64", "NetworkInterfacesCount": "1,1", + "IAMLambdaRoleName": "NONE", } @@ -412,6 +415,7 @@ class DefaultDict(Enum): # architecture "Architecture": "x86_64", "NetworkInterfacesCount": "1,1", + "IAMLambdaRoleName": "NONE", } diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index ad01d2604f..3ecbb288e4 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -285,6 +285,11 @@ "Type": "CommaDelimitedList", "Default": "NONE" }, + "IAMLambdaRoleName": { + "Description": "Existing IAM role name for Lambda functions", + "Type": "String", + "Default": "NONE" + }, "VPCSecurityGroupId": { "Description": "Existing VPC security group Id", "Type": "String", @@ -660,6 +665,14 @@ "NONE" ] }, + "CreateIAMLambdaRole": { + "Fn::Equals": [ + { + "Ref": "IAMLambdaRoleName" + }, + "NONE" + ] + }, "AddHITIamPolicies": { "Fn::And": [ { @@ -2350,6 +2363,9 @@ "MasterServerSubstack", "Outputs.MasterPrivateIP" ] + }, + "IAMLambdaRoleName": { + "Ref": "IAMLambdaRoleName" } }, "TemplateURL": { @@ -2510,7 +2526,8 @@ "PolicyName": "LambdaPolicy" } ] - } + }, + "Condition": "CreateIAMLambdaRole" }, "CleanupResourcesS3BucketCustomResource": { "Type": "AWS::CloudFormation::CustomResource", @@ -2587,9 +2604,17 @@ "Handler": "cleanup_resources.handler", "MemorySize": 128, "Role": { - "Fn::GetAtt": [ - "CleanupResourcesFunctionExecutionRole", - "Arn" + "Fn::If": [ + "CreateIAMLambdaRole", + { + "Fn::GetAtt": [ + "CleanupResourcesFunctionExecutionRole", + "Arn" + ] + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMLambdaRoleName}" + } ] }, "Runtime": "python3.8", @@ -3963,6 +3988,9 @@ } ] }, + "IAMLambdaRoleName": { + "Ref": "IAMLambdaRoleName" + }, "ResourcesS3Bucket": { "Ref": "ResourcesS3Bucket" }, diff --git a/cloudformation/batch-substack.cfn.json b/cloudformation/batch-substack.cfn.json index 4be334dd4d..980fceb0fd 100644 --- a/cloudformation/batch-substack.cfn.json +++ b/cloudformation/batch-substack.cfn.json @@ -99,6 +99,10 @@ "MasterPrivateIP": { "Description": "Private IP of the head node", "Type": "String" + }, + "IAMLambdaRoleName": { + "Description": "Existing IAM role name for Lambda functions", + "Type": "String" } }, "Conditions": { @@ -117,6 +121,14 @@ }, "arm64" ] + }, + "CreateIAMLambdaRole": { + "Fn::Equals": [ + { + "Ref": "IAMLambdaRoleName" + }, + "NONE" + ] } }, "Resources": { @@ -821,9 +833,17 @@ "Handler": "manage_docker_images.handler", "MemorySize": 128, "Role": { - "Fn::GetAtt": [ - "ManageDockerImagesFunctionExecutionRole", - "Arn" + "Fn::If": [ + "CreateIAMLambdaRole", + { + "Fn::GetAtt": [ + "ManageDockerImagesFunctionExecutionRole", + "Arn" + ] + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMLambdaRoleName}" + } ] }, "Runtime": "python3.6", @@ -897,7 +917,8 @@ "PolicyName": "LambdaPolicy" } ] - } + }, + "Condition": "CreateIAMLambdaRole" }, "DockerBuildWaitHandle": { "Type": "AWS::CloudFormation::WaitConditionHandle", @@ -952,7 +973,8 @@ "PolicyName": "LambdaPolicy" } ] - } + }, + "Condition": "CreateIAMLambdaRole" }, "SendBuildNotificationFunction": { "Type": "AWS::Lambda::Function", @@ -971,9 
+993,17 @@ "Handler": "send_build_notification.handler", "MemorySize": 128, "Role": { - "Fn::GetAtt": [ - "SendBuildNotificationFunctionExecutionRole", - "Arn" + "Fn::If": [ + "CreateIAMLambdaRole", + { + "Fn::GetAtt": [ + "SendBuildNotificationFunctionExecutionRole", + "Arn" + ] + }, + { + "Fn::Sub": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMLambdaRoleName}" + } ] }, "Runtime": "python3.6", diff --git a/cloudformation/compute-fleet-hit-substack.cfn.yaml b/cloudformation/compute-fleet-hit-substack.cfn.yaml index 69c1df4e0d..3efd304eb1 100644 --- a/cloudformation/compute-fleet-hit-substack.cfn.yaml +++ b/cloudformation/compute-fleet-hit-substack.cfn.yaml @@ -85,6 +85,8 @@ Parameters: Type: AWS::EC2::VPC::Id RootRole: Type: String + IAMLambdaRoleName: + Type: String ResourcesS3Bucket: Type: String ArtifactS3RootDirectory: @@ -101,6 +103,9 @@ Conditions: UseAssociatePublicIpAddress: !Equals - !Ref 'AssociatePublicIpAddress' - true + CreateIAMLambdaRole: !Equals + - !Ref 'IAMLambdaRoleName' + - NONE Resources: {%- for queue, queue_config in queues.items() %} {%- for compute_resource in queue_config.compute_resource_settings.values() %} @@ -528,7 +533,10 @@ Resources: S3Key: !Sub '${ArtifactS3RootDirectory}/custom_resources_code/artifacts.zip' Handler: cleanup_resources.handler MemorySize: 128 - Role: !GetAtt 'CleanupRoute53FunctionExecutionRole.Arn' + Role: !If + - CreateIAMLambdaRole + - !GetAtt 'CleanupRoute53FunctionExecutionRole.Arn' + - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMLambdaRoleName}' Runtime: python3.8 Timeout: 900 CleanupRoute53CustomResource: @@ -570,6 +578,7 @@ Resources: - ClusterHostedZone: !Ref 'ClusterHostedZone' Version: '2012-10-17' PolicyName: LambdaPolicy + Condition: CreateIAMLambdaRole {%- endif %} UpdateWaiterFunction: Type: AWS::Lambda::Function @@ -580,7 +589,10 @@ Resources: S3Key: !Sub '${ArtifactS3RootDirectory}/custom_resources_code/artifacts.zip' Handler: wait_for_update.handler MemorySize: 128 - Role: !GetAtt 'UpdateWaiterFunctionExecutionRole.Arn' + Role: !If + - CreateIAMLambdaRole + - !GetAtt 'UpdateWaiterFunctionExecutionRole.Arn' + - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:role/${IAMLambdaRoleName}' Runtime: python3.8 Timeout: 900 UpdateWaiterFunctionExecutionRole: @@ -613,6 +625,7 @@ Resources: Resource: !Sub 'arn:${AWS::Partition}:dynamodb:${AWS::Region}:${AWS::AccountId}:table/${DynamoDBTable}' Version: '2012-10-17' PolicyName: LambdaPolicy + Condition: CreateIAMLambdaRole Metadata: RootRole: !Ref 'RootRole' VPCId: !Ref 'VPCId' diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index c0139d0136..d1795abc65 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -219,13 +219,19 @@ efa: instances: ["p4d.24xlarge"] oss: ["alinux", "ubuntu1804", "centos7"] schedulers: ["sge"] -iam_policies: - test_iam_policies.py::test_iam_policies: +iam: + test_iam.py::test_iam_policies: dimensions: - regions: ["eu-north-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] schedulers: ["slurm", "awsbatch"] + test_iam.py::test_iam_roles: + dimensions: + - regions: ["us-east-2"] + schedulers: ["awsbatch", "slurm", "sge"] + oss: ["alinux2"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} intel_hpc: test_intel_hpc.py::test_intel_hpc: dimensions: diff --git a/tests/integration-tests/configs/new_region.yaml b/tests/integration-tests/configs/new_region.yaml index e77e1f3ed0..3f7b79e4df 
100644 --- a/tests/integration-tests/configs/new_region.yaml +++ b/tests/integration-tests/configs/new_region.yaml @@ -108,8 +108,8 @@ test-suites: instances: ["c5n.18xlarge"] oss: ["alinux2"] schedulers: ["slurm"] - iam_policies: - test_iam_policies.py::test_iam_policies: + iam: + test_iam.py::test_iam_policies: dimensions: - regions: {{ NEW_REGION }} instances: {{ common.INSTANCES_DEFAULT_X86 }} diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 50cdac7782..ae7beee5f0 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -18,11 +18,13 @@ import os import random import re +import time from shutil import copyfile from traceback import format_tb import boto3 import configparser +import pkg_resources import pytest from cfn_stacks_factory import CfnStack, CfnStacksFactory from clusters_factory import Cluster, ClustersFactory @@ -52,7 +54,7 @@ unset_credentials, ) -from tests.common.utils import retrieve_pcluster_ami_without_standard_naming +from tests.common.utils import get_sts_endpoint, retrieve_pcluster_ami_without_standard_naming def pytest_addoption(parser): @@ -293,6 +295,11 @@ def _cluster_factory(cluster_config, extra_args=None, raise_on_error=True): ) +@pytest.fixture(scope="class") +def cluster_model(scheduler): + return "HIT" if scheduler == "slurm" else "SIT" + + def _write_cluster_config_to_outdir(request, cluster_config): out_dir = request.config.getoption("output_dir") @@ -618,6 +625,108 @@ def vpc_stacks(cfn_stacks_factory, request): return vpc_stacks +@pytest.fixture(scope="class") +def common_pcluster_policies(region): + """Create four policies to be attached to ec2_iam_role, iam_lamda_role for awsbatch or traditional schedulers.""" + policies = {} + policies["awsbatch_instance_policy"] = _create_iam_policies( + "integ-tests-ParallelClusterInstancePolicy-batch-" + random_alphanumeric(), region, "batch_instance_policy.json" + ) + policies["traditional_instance_policy"] = _create_iam_policies( + "integ-tests-ParallelClusterInstancePolicy-traditional-" + random_alphanumeric(), + region, + "traditional_instance_policy.json", + ) + policies["awsbatch_lambda_policy"] = _create_iam_policies( + "integ-tests-ParallelClusterLambdaPolicy-batch-" + random_alphanumeric(), + region, + "batch_lambda_function_policy.json", + ) + policies["traditional_lambda_policy"] = _create_iam_policies( + "integ-tests-ParallelClusterLambdaPolicy-traditional-" + random_alphanumeric(), + region, + "traditional_lambda_function_policy.json", + ) + + yield policies + + iam_client = boto3.client("iam", region_name=region) + for policy in policies.values(): + iam_client.delete_policy(PolicyArn=policy) + + +@pytest.fixture(scope="class") +def role_factory(region): + roles = [] + iam_client = boto3.client("iam", region_name=region) + + def create_role(trusted_service, policies=()): + iam_role_name = f"integ-tests_{trusted_service}_{region}_{random_alphanumeric()}" + logging.info(f"Creating iam role {iam_role_name} for {trusted_service}") + + partition = _get_arn_partition(region) + domain_suffix = ".cn" if partition == "aws-cn" else "" + + trust_relationship_policy_ec2 = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": f"{trusted_service}.amazonaws.com{domain_suffix}"}, + "Action": "sts:AssumeRole", + } + ], + } + iam_client.create_role( + RoleName=iam_role_name, + AssumeRolePolicyDocument=json.dumps(trust_relationship_policy_ec2), + Description="Role for create custom KMS key", + ) + 
+ logging.info(f"Attaching iam policy to the role {iam_role_name}...") + for policy in policies: + iam_client.attach_role_policy(RoleName=iam_role_name, PolicyArn=policy) + + # Having time.sleep here because because it take a while for the the IAM role to become valid for use in the + # put_key_policy step for creating KMS key, read the following link for reference : + # https://stackoverflow.com/questions/20156043/how-long-should-i-wait-after-applying-an-aws-iam-policy-before-it-is-valid + time.sleep(60) + logging.info(f"Iam role is ready: {iam_role_name}") + roles.append({"role_name": iam_role_name, "policies": policies}) + return iam_role_name + + yield create_role + + for role in roles: + role_name = role["role_name"] + policies = role["policies"] + for policy in policies: + iam_client.detach_role_policy(RoleName=role_name, PolicyArn=policy) + logging.info(f"Deleting iam role {role_name}") + iam_client.delete_role(RoleName=role_name) + + +def _create_iam_policies(iam_policy_name, region, policy_filename): + logging.info("Creating iam policy {0}...".format(iam_policy_name)) + file_loader = FileSystemLoader(pkg_resources.resource_filename(__name__, "/resources")) + env = Environment(loader=file_loader, trim_blocks=True, lstrip_blocks=True) + partition = _get_arn_partition(region) + account_id = ( + boto3.client("sts", region_name=region, endpoint_url=get_sts_endpoint(region)) + .get_caller_identity() + .get("Account") + ) + parallel_cluster_instance_policy = env.get_template(policy_filename).render( + partition=partition, + region=region, + account_id=account_id, + cluster_bucket_name="parallelcluster-*", + ) + return boto3.client("iam", region_name=region).create_policy( + PolicyName=iam_policy_name, PolicyDocument=parallel_cluster_instance_policy + )["Policy"]["Arn"] + + @pytest.fixture(scope="class") def vpc_stack(vpc_stacks, region): return vpc_stacks[region] diff --git a/tests/integration-tests/resources/batch_lambda_function_policy.json b/tests/integration-tests/resources/batch_lambda_function_policy.json new file mode 100644 index 0000000000..ed92d0fbf2 --- /dev/null +++ b/tests/integration-tests/resources/batch_lambda_function_policy.json @@ -0,0 +1,44 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Effect": "Allow", + "Resource": "arn:{{ partition }}:logs:*:*:*", + "Sid": "CloudWatchLogsPolicy" + }, + { + "Action": [ + "ecr:BatchDeleteImage", + "ecr:ListImages" + ], + "Effect": "Allow", + "Resource": "*", + "Sid": "ECRPolicy" + }, + { + "Action": [ + "codebuild:BatchGetBuilds", + "codebuild:StartBuild" + ], + "Effect": "Allow", + "Resource": "*", + "Sid": "CodeBuildPolicy" + }, + { + "Action": [ + "s3:DeleteBucket", + "s3:DeleteObject", + "s3:DeleteObjectVersion", + "s3:ListBucket", + "s3:ListBucketVersions" + ], + "Effect": "Allow", + "Resource": "*", + "Sid": "S3BucketPolicy" + } + ] +} \ No newline at end of file diff --git a/tests/integration-tests/resources/traditional_lambda_function_policy.json b/tests/integration-tests/resources/traditional_lambda_function_policy.json new file mode 100644 index 0000000000..a3545486ac --- /dev/null +++ b/tests/integration-tests/resources/traditional_lambda_function_policy.json @@ -0,0 +1,64 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Resource": "arn:{{ partition }}:logs:*:*:*", + "Effect": "Allow", + "Sid": "CloudWatchLogsPolicy" + }, + { + "Action": [ + "s3:DeleteBucket", + 
"s3:DeleteObject", + "s3:DeleteObjectVersion", + "s3:ListBucket", + "s3:ListBucketVersions" + ], + "Resource": [ + "arn:{{ partition }}:s3:::*" + ], + "Effect": "Allow", + "Sid": "S3BucketPolicy" + }, + { + "Action": [ + "ec2:DescribeInstances" + ], + "Resource": "*", + "Effect": "Allow", + "Sid": "DescribeInstances" + }, + { + "Action": [ + "ec2:TerminateInstances" + ], + "Resource": "*", + "Effect": "Allow", + "Sid": "FleetTerminatePolicy" + }, + { + "Action": [ + "dynamodb:GetItem", + "dynamodb:PutItem" + ], + "Resource": "arn:{{ partition }}:dynamodb:{{ region }}:{{ account_id }}:table/parallelcluster-*", + "Effect": "Allow", + "Sid": "DynamoDBTable" + }, + { + "Action": [ + "route53:ListResourceRecordSets", + "route53:ChangeResourceRecordSets" + ], + "Resource": [ + "arn:{{ partition }}:route53:::hostedzone/*" + ], + "Effect": "Allow", + "Sid": "Route53DeletePolicy" + } + ] +} \ No newline at end of file diff --git a/tests/integration-tests/tests/common/utils.py b/tests/integration-tests/tests/common/utils.py index d9ee887512..53bb8172c6 100644 --- a/tests/integration-tests/tests/common/utils.py +++ b/tests/integration-tests/tests/common/utils.py @@ -144,3 +144,8 @@ def _assert_ami_is_available(region, ami_id): def get_installed_parallelcluster_version(): """Get the version of the installed aws-parallelcluster package.""" return pkg_resources.get_distribution("aws-parallelcluster").version + + +def get_sts_endpoint(region): + """Get regionalized STS endpoint.""" + return "https://sts.{0}.{1}".format(region, "amazonaws.com.cn" if region.startswith("cn-") else "amazonaws.com") diff --git a/tests/integration-tests/tests/iam/test_iam.py b/tests/integration-tests/tests/iam/test_iam.py new file mode 100644 index 0000000000..632071f269 --- /dev/null +++ b/tests/integration-tests/tests/iam/test_iam.py @@ -0,0 +1,133 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging +import os +from shutil import copyfile + +import boto3 +import pytest +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor + +from tests.common.assertions import assert_no_errors_in_logs + + +@pytest.mark.usefixtures("os", "instance") +def test_iam_roles( + region, + scheduler, + common_pcluster_policies, + role_factory, + pcluster_config_reader, + clusters_factory, + cluster_model, + test_datadir, +): + is_awsbatch = scheduler == "awsbatch" + if is_awsbatch: + instance_policies = common_pcluster_policies["awsbatch_instance_policy"] + lambda_policies = common_pcluster_policies["awsbatch_lambda_policy"] + else: + instance_policies = common_pcluster_policies["traditional_instance_policy"] + lambda_policies = common_pcluster_policies["traditional_lambda_policy"] + cluster_role_name = role_factory("ec2", [instance_policies]) + lambda_role_name = role_factory("lambda", [lambda_policies]) + + # Copy the config file template for reuse in update. 
+ config_file_name = cluster_model + ".ini" + config_file_path = os.path.join(str(test_datadir), config_file_name) + updated_config_file_name = cluster_model + ".update.ini" + updated_config_file_path = os.path.join(str(test_datadir), updated_config_file_name) + copyfile(config_file_path, updated_config_file_path) + + cluster_config = pcluster_config_reader( + config_file=config_file_name, ec2_iam_role=cluster_role_name, iam_lambda_role=lambda_role_name + ) + cluster = clusters_factory(cluster_config) + + main_stack_name = "parallelcluster-" + cluster.name + cfn_client = boto3.client("cloudformation", region_name=region) + lambda_client = boto3.client("lambda", region_name=region) + + # Check all CloudFormation stacks after creation + # If scheduler is awsbatch, there will still be IAM roles created. + _check_lambda_role(cfn_client, lambda_client, main_stack_name, lambda_role_name, not is_awsbatch) + + # Test updating the iam_lambda_role + updated_lambda_role_name = role_factory("lambda", [lambda_policies]) + assert_that(updated_lambda_role_name == lambda_role_name).is_false() + cluster.config_file = str( + pcluster_config_reader( + config_file=updated_config_file_name, + ec2_iam_role=cluster_role_name, + iam_lambda_role=updated_lambda_role_name, + ) + ) + cluster.update() + + # Check all CloudFormation stacks after update + _check_lambda_role(cfn_client, lambda_client, main_stack_name, updated_lambda_role_name, not is_awsbatch) + + +def _check_lambda_role(cfn_client, lambda_client, stack_name, lambda_role_name, check_no_role_is_created): + """Test lambda role is attached to all Lambda functions in the stack and its substack.""" + resources = cfn_client.describe_stack_resources(StackName=stack_name)["StackResources"] + for resource in resources: + resource_type = resource["ResourceType"] + if check_no_role_is_created: + # If check_no_role_is_created, check that there is no role created in the stack and its substack. 
+ assert_that(resource_type).is_not_equal_to("AWS::IAM::Role") + if resource_type == "AWS::CloudFormation::Stack": + # Recursively check substacks + _check_lambda_role( + cfn_client, lambda_client, resource["PhysicalResourceId"], lambda_role_name, check_no_role_is_created + ) + if resource_type == "AWS::Lambda::Function": + # Check the role is attached to the Lambda function + lambda_function = lambda_client.get_function(FunctionName=resource["PhysicalResourceId"])["Configuration"] + assert_that(lambda_role_name in lambda_function["Role"]).is_true() + + +@pytest.mark.regions(["ap-northeast-2"]) +@pytest.mark.schedulers(["slurm", "awsbatch"]) +@pytest.mark.oss(["alinux2"]) +@pytest.mark.usefixtures("os", "instance") +def test_iam_policies(region, scheduler, pcluster_config_reader, clusters_factory): + """Test IAM Policies""" + cluster_config = pcluster_config_reader( + iam_policies="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess, arn:aws:iam::aws:policy/AWSBatchFullAccess" + ) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + _test_s3_access(remote_command_executor, region) + + if scheduler == "awsbatch": + _test_batch_access(remote_command_executor, region) + + assert_no_errors_in_logs(remote_command_executor, scheduler) + + +def _test_s3_access(remote_command_executor, region): + logging.info("Testing S3 Access") + result = remote_command_executor.run_remote_command(f"AWS_DEFAULT_REGION={region} aws s3 ls").stdout + # An error occurred (AccessDenied) when calling the ListBuckets operation: Access Denied + assert_that(result).does_not_contain("AccessDenied") + + +def _test_batch_access(remote_command_executor, region): + logging.info("Testing AWS Batch Access") + result = remote_command_executor.run_remote_command( + f"AWS_DEFAULT_REGION={region} aws batch describe-compute-environments" + ).stdout + # An error occurred (AccessDeniedException) when calling the DescribeComputeEnvironments operation: ... 
+ assert_that(result).does_not_contain("AccessDeniedException") diff --git a/tests/integration-tests/tests/iam_policies/test_iam_policies/test_iam_policies/pcluster.config.ini b/tests/integration-tests/tests/iam/test_iam/test_iam_policies/pcluster.config.ini similarity index 100% rename from tests/integration-tests/tests/iam_policies/test_iam_policies/test_iam_policies/pcluster.config.ini rename to tests/integration-tests/tests/iam/test_iam/test_iam_policies/pcluster.config.ini diff --git a/tests/integration-tests/tests/iam/test_iam/test_iam_roles/HIT.ini b/tests/integration-tests/tests/iam/test_iam/test_iam_roles/HIT.ini new file mode 100644 index 0000000000..da7e4e06c6 --- /dev/null +++ b/tests/integration-tests/tests/iam/test_iam/test_iam_roles/HIT.ini @@ -0,0 +1,29 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +base_os = {{ os }} +queue_settings = compute +ec2_iam_role = {{ ec2_iam_role }} +iam_lambda_role = {{ iam_lambda_role }} + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +use_public_ips = false + +[queue compute] +enable_efa = false +enable_efa_gdr = false +compute_resource_settings = default + +[compute_resource default] +instance_type = {{ instance }} diff --git a/tests/integration-tests/tests/iam/test_iam/test_iam_roles/SIT.ini b/tests/integration-tests/tests/iam/test_iam/test_iam_roles/SIT.ini new file mode 100644 index 0000000000..79707cc3a7 --- /dev/null +++ b/tests/integration-tests/tests/iam/test_iam/test_iam_roles/SIT.ini @@ -0,0 +1,28 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +base_os = {{ os }} +ec2_iam_role = {{ ec2_iam_role }} +iam_lambda_role = {{ iam_lambda_role }} +{% if scheduler == "awsbatch" %} +min_vcpus = 1 +desired_vcpus = 1 +{% else %} +initial_queue_size = 1 +maintain_initial_size = true +{% endif %} + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +use_public_ips = false diff --git a/tests/integration-tests/tests/iam_policies/test_iam_policies.py b/tests/integration-tests/tests/iam_policies/test_iam_policies.py deleted file mode 100644 index f14a49dd19..0000000000 --- a/tests/integration-tests/tests/iam_policies/test_iam_policies.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. -# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. -# See the License for the specific language governing permissions and limitations under the License. 
-import logging - -import pytest -from assertpy import assert_that -from remote_command_executor import RemoteCommandExecutor - -from tests.common.assertions import assert_no_errors_in_logs - - -@pytest.mark.regions(["ap-northeast-2"]) -@pytest.mark.schedulers(["slurm", "awsbatch"]) -@pytest.mark.oss(["alinux2"]) -@pytest.mark.usefixtures("os", "instance") -def test_iam_policies(region, scheduler, pcluster_config_reader, clusters_factory): - """Test IAM Policies""" - cluster_config = pcluster_config_reader( - iam_policies="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess, arn:aws:iam::aws:policy/AWSBatchFullAccess" - ) - cluster = clusters_factory(cluster_config) - remote_command_executor = RemoteCommandExecutor(cluster) - - _test_s3_access(remote_command_executor, region) - - if scheduler == "awsbatch": - _test_batch_access(remote_command_executor, region) - - assert_no_errors_in_logs(remote_command_executor, scheduler) - - -def _test_s3_access(remote_command_executor, region): - logging.info("Testing S3 Access") - result = remote_command_executor.run_remote_command("AWS_DEFAULT_REGION={0} aws s3 ls".format(region)).stdout - # An error occurred (AccessDenied) when calling the ListBuckets operation: Access Denied - assert_that(result).does_not_contain("AccessDenied") - - -def _test_batch_access(remote_command_executor, region): - logging.info("Testing AWS Batch Access") - result = remote_command_executor.run_remote_command( - "AWS_DEFAULT_REGION={0} aws batch describe-compute-environments".format(region) - ).stdout - # An error occurred (AccessDeniedException) when calling the DescribeComputeEnvironments operation: ... - assert_that(result).does_not_contain("AccessDeniedException") From 5f649f7b75803393a7e207afac4dd54e400ca6e4 Mon Sep 17 00:00:00 2001 From: chenwany Date: Thu, 10 Dec 2020 07:41:54 -0800 Subject: [PATCH 49/66] Add support for EBS gp3 volume type Signed-off-by: chenwany --- CHANGELOG.md | 5 +- cli/src/pcluster/config/cfn_param_types.py | 29 ++- cli/src/pcluster/config/mappings.py | 64 ++++-- cli/src/pcluster/config/validators.py | 46 +++- cli/tests/pcluster/config/defaults.py | 23 +- .../pcluster/config/test_section_cluster.py | 18 +- cli/tests/pcluster/config/test_section_ebs.py | 34 ++- .../pcluster.config.ini | 6 + .../pcluster/config/test_section_raid.py | 16 +- cli/tests/pcluster/config/test_validators.py | 68 +++++- .../test_slurm_sit_full/expected_output.ini | 4 +- .../test_slurm_sit_full/pcluster.config.ini | 3 +- .../expected_output.ini | 2 + .../pcluster.config.ini | 4 +- cloudformation/aws-parallelcluster.cfn.json | 22 +- cloudformation/cw-dashboard-substack.cfn.yaml | 4 +- cloudformation/ebs-substack.cfn.json | 216 +++++++++++++++++- cloudformation/raid-substack.cfn.json | 214 ++++++++++++++++- .../tests/storage/test_ebs.py | 2 +- .../test_ebs_multiple/pcluster.config.ini | 5 +- 20 files changed, 700 insertions(+), 85 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35a0769fb4..feeec67117 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,12 +13,12 @@ CHANGELOG - EBS io2 is not supported in af-south-1 and eu-south-1 - Remove CloudFormation DescribeStacks API call from AWS Batch Docker entrypoint. This removes the possibility of job failures due to CloudFormation throttling. -- Add support for io2 EBS volume type. 
- Install EFA kernel module also on ARM instances with `alinux2` and `ubuntu1804` - Add `iam_lambda_role` parameter under `cluster` section to enable the possibility to specify an existing IAM role to be used by AWS Lambda functions in CloudFormation. When using `sge`, `torque`, or `slurm` as the scheduler, `pcluster` will not create any IAM role if both `ec2_iam_role` and `iam_lambda_role` are provided. +- Add support for io2 and gp3 EBS volume type. **CHANGES** @@ -38,7 +38,8 @@ CHANGELOG **BUG FIXES** - Mandate the presence of `vpc_settings`, `vpc_id`, `master_subnet_id` in the config file to avoid unhandled exceptions. - +- Set the default EBS volume size to 500 GiB when volume type is `st1` or `sc1`. + 2.10.0 ------ diff --git a/cli/src/pcluster/config/cfn_param_types.py b/cli/src/pcluster/config/cfn_param_types.py index 6a0bdca392..4239eb33ce 100644 --- a/cli/src/pcluster/config/cfn_param_types.py +++ b/cli/src/pcluster/config/cfn_param_types.py @@ -1317,5 +1317,32 @@ def refresh(self): ebs_snapshot_id = section.get_param_value("ebs_snapshot_id") default_volume_size = get_ebs_snapshot_info(ebs_snapshot_id).get("VolumeSize") else: - default_volume_size = 20 + default_volume_size = 500 if section.get_param_value("volume_type") in ["st1", "sc1"] else 20 self.value = default_volume_size + + +class VolumeIopsParam(IntCfnParam): + """Class to manage ebs volume_iops parameter in the EBS section.""" + + EBS_VOLUME_TYPE_IOPS_DEFAULT = { + "io1": 100, + "io2": 100, + "gp3": 3000, + } + + def refresh(self): + """ + We need this method to set a different default value for ebs IOPS depending on the volume type. + + Check whether the user has provided an input for ebs volume_iops when the volume type is "gp3". + For "gp3", the default iops is 3000. For other volumes, the default iops is 100. If volume_iops is not + specified by the user, we will create an EBS volume with the default volume + iops.
+ """ + section = self.pcluster_config.get_section(self.section_key, self.section_label) + + if section and section.get_param_value("volume_iops") is None: + volume_type = section.get_param_value("volume_type") + if volume_type in VolumeIopsParam.EBS_VOLUME_TYPE_IOPS_DEFAULT: + default_iops = VolumeIopsParam.EBS_VOLUME_TYPE_IOPS_DEFAULT.get(volume_type) + self.value = default_iops diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py index 36e4575a54..3d9215760c 100644 --- a/cli/src/pcluster/config/mappings.py +++ b/cli/src/pcluster/config/mappings.py @@ -38,6 +38,7 @@ SpotBidPercentageCfnParam, SpotPriceCfnParam, TagsParam, + VolumeIopsParam, VolumeSizeParam, ) from pcluster.config.json_param_types import ( @@ -66,6 +67,7 @@ ebs_settings_validator, ebs_volume_iops_validator, ebs_volume_size_snapshot_validator, + ebs_volume_throughput_validator, ebs_volume_type_size_validator, ec2_ami_validator, ec2_iam_policies_validator, @@ -151,7 +153,7 @@ "snapshot_id": r"^snap-[0-9a-z]{8}$|^snap-[0-9a-z]{17}$", "subnet_id": r"^subnet-[0-9a-z]{8}$|^subnet-[0-9a-z]{17}$", "volume_id": r"^vol-[0-9a-z]{8}$|^vol-[0-9a-z]{17}$", - "volume_types": ["standard", "io1", "io2", "gp2", "st1", "sc1"], + "volume_types": ["standard", "io1", "io2", "gp2", "st1", "sc1", "gp3"], "vpc_id": r"^vpc-[0-9a-z]{8}$|^vpc-[0-9a-z]{17}$", "fsx_deployment_type": ["SCRATCH_1", "SCRATCH_2", "PERSISTENT_1"], "fsx_ssd_throughput": FSX_SSD_THROUGHPUT, @@ -312,20 +314,22 @@ "key": "ebs", "default_label": "default", "max_resources": 5, - "validators": [ebs_volume_type_size_validator, ebs_volume_iops_validator, ebs_volume_size_snapshot_validator], - "params": { - "shared_dir": { + "validators": [ebs_volume_type_size_validator, ebs_volume_iops_validator, ebs_volume_size_snapshot_validator, + ebs_volume_throughput_validator], + "params": OrderedDict([ # Use OrderedDict because the in python 3.5 a dict is not ordered by default, need it in + # the test of hit converter + ("shared_dir", { "allowed_values": ALLOWED_VALUES["file_path"], "cfn_param_mapping": "SharedDir", "validators": [shared_dir_validator], "update_policy": UpdatePolicy.UNSUPPORTED - }, - "ebs_snapshot_id": { + }), + ("ebs_snapshot_id", { "allowed_values": ALLOWED_VALUES["snapshot_id"], "cfn_param_mapping": "EBSSnapshotId", "update_policy": UpdatePolicy.UNSUPPORTED - }, - "volume_type": { + }), + ("volume_type", { "default": "gp2", "allowed_values": ALLOWED_VALUES["volume_types"], "cfn_param_mapping": "VolumeType", @@ -333,8 +337,8 @@ UpdatePolicy.UNSUPPORTED, action_needed=UpdatePolicy.ACTIONS_NEEDED["ebs_volume_update"] ) - }, - "volume_size": { + }), + ("volume_size", { "type": VolumeSizeParam, "cfn_param_mapping": "VolumeSize", "update_policy": UpdatePolicy( @@ -342,31 +346,36 @@ fail_reason=UpdatePolicy.FAIL_REASONS["ebs_volume_resize"], action_needed=UpdatePolicy.ACTIONS_NEEDED["ebs_volume_update"] ) - }, - "volume_iops": { - "type": IntCfnParam, - "default": 100, + }), + ("volume_iops", { + "type": VolumeIopsParam, "cfn_param_mapping": "VolumeIOPS", "update_policy": UpdatePolicy.SUPPORTED - }, - "encrypted": { + }), + ("encrypted", { "type": BoolCfnParam, "cfn_param_mapping": "EBSEncryption", "default": False, "update_policy": UpdatePolicy.UNSUPPORTED - }, - "ebs_kms_key_id": { + }), + ("ebs_kms_key_id", { "cfn_param_mapping": "EBSKMSKeyId", "validators": [kms_key_validator], "update_policy": UpdatePolicy.UNSUPPORTED - }, - "ebs_volume_id": { + }), + ("ebs_volume_id", { "cfn_param_mapping": "EBSVolumeId", "allowed_values": 
ALLOWED_VALUES["volume_id"], "validators": [ec2_volume_validator], "update_policy": UpdatePolicy.UNSUPPORTED - }, - }, + }), + ("volume_throughput", { + "type": IntCfnParam, + "cfn_param_mapping": "VolumeThroughput", + "update_policy": UpdatePolicy.SUPPORTED, + "default": 125 + }) + ]), } EFS = { @@ -419,7 +428,7 @@ "type": CfnSection, "key": "raid", "default_label": "default", - "validators": [ebs_volume_type_size_validator, ebs_volume_iops_validator], + "validators": [ebs_volume_type_size_validator, ebs_volume_iops_validator, ebs_volume_throughput_validator], "cfn_param_mapping": "RAIDOptions", # All the parameters in the section are converted into a single CFN parameter "params": OrderedDict( # Use OrderedDict because the parameters must respect the order in the CFN parameter [ @@ -451,8 +460,7 @@ "update_policy": UpdatePolicy.UNSUPPORTED }), ("volume_iops", { - "type": IntCfnParam, - "default": 100, + "type": VolumeIopsParam, "update_policy": UpdatePolicy.SUPPORTED }), ("encrypted", { @@ -464,6 +472,12 @@ "validators": [kms_key_validator], "update_policy": UpdatePolicy.UNSUPPORTED }), + ("volume_throughput", { + "type": IntCfnParam, + "default": 125, + "cfn_param_mapping": "VolumeThroughput", + "update_policy": UpdatePolicy.SUPPORTED + }), ] ) } diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index 8dda0800a4..e1b46ea143 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -78,10 +78,17 @@ "io1": (4, 16 * 1024), "io2": (4, 16 * 1024), "gp2": (1, 16 * 1024), + "gp3": (1, 16 * 1024), "st1": (500, 16 * 1024), "sc1": (500, 16 * 1024), } +EBS_VOLUME_IOPS_BOUNDS = { + "io1": (100, 64000), + "io2": (100, 64000), + "gp3": (3000, 16000), +} + HEAD_NODE_UNSUPPORTED_INSTANCE_TYPES = [] HEAD_NODE_UNSUPPORTED_MESSAGE = "The instance type '{0}' is not supported as head node." @@ -1325,7 +1332,7 @@ def ebs_volume_type_size_validator(section_key, section_label, pcluster_config): The default value of volume_size for EBS volumes is 20 GiB. 
The volume size of standard ranges from 1 GiB - 1 TiB(1024 GiB) - The volume size of gp2 ranges from 1 GiB - 16 TiB(16384 GiB) + The volume size of gp2 and gp3 ranges from 1 GiB - 16 TiB(16384 GiB) The volume size of io1 and io2 ranges from 4 GiB - 16 TiB(16384 GiB) The volume sizes of st1 and sc1 range from 500 GiB - 16 TiB(16384 GiB) """ @@ -1353,11 +1360,11 @@ def ebs_volume_iops_validator(section_key, section_label, pcluster_config): section = pcluster_config.get_section(section_key, section_label) volume_size = section.get_param_value("volume_size") volume_type = section.get_param_value("volume_type") - volume_type_to_iops_ratio = {"io1": 50, "io2": 500} + volume_type_to_iops_ratio = {"io1": 50, "io2": 500, "gp3": 500} volume_iops = section.get_param_value("volume_iops") - min_iops = 100 - max_iops = 64000 - if volume_type in volume_type_to_iops_ratio: + + if volume_type in EBS_VOLUME_IOPS_BOUNDS: + min_iops, max_iops = EBS_VOLUME_IOPS_BOUNDS.get(volume_type) if volume_iops and (volume_iops < min_iops or volume_iops > max_iops): errors.append( "IOPS rate must be between {min_iops} and {max_iops} when provisioning {volume_type} volumes.".format( @@ -1558,3 +1565,32 @@ def efa_os_arch_validator(param_key, param_value, pcluster_config): errors.append("EFA currently not supported on {0} for {1} architecture".format(base_os, architecture)) return errors, warnings + + +def ebs_volume_throughput_validator(section_key, section_label, pcluster_config): + errors = [] + warnings = [] + + section = pcluster_config.get_section(section_key, section_label) + volume_type = section.get_param_value("volume_type") + volume_iops = section.get_param_value("volume_iops") + volume_throughput = section.get_param_value("volume_throughput") + volume_throughput_to_iops_ratio = 0.25 + + if volume_type == "gp3": + min_throughput, max_throughput = 125, 1000 + if volume_throughput < min_throughput or volume_throughput > max_throughput: + errors.append( + "Throughput must be between {min_throughput} MB/s and {max_throughput} MB/s when provisioning " + "{volume_type} volumes.".format( + min_throughput=min_throughput, max_throughput=max_throughput, volume_type=volume_type + ) + ) + if volume_throughput and volume_throughput > volume_iops * volume_throughput_to_iops_ratio: + errors.append( + "Throughput to IOPS ratio of {0} is too high; maximum is 0.25.".format( + float(volume_throughput) / float(volume_iops) + ) + ) + + return errors, warnings diff --git a/cli/tests/pcluster/config/defaults.py b/cli/tests/pcluster/config/defaults.py index fec837315a..2aa3ef4741 100644 --- a/cli/tests/pcluster/config/defaults.py +++ b/cli/tests/pcluster/config/defaults.py @@ -40,10 +40,11 @@ "ebs_snapshot_id": None, "volume_type": "gp2", "volume_size": None, - "volume_iops": 100, + "volume_iops": None, "encrypted": False, "ebs_kms_key_id": None, "ebs_volume_id": None, + "volume_throughput": 125, } DEFAULT_EFS_DICT = { @@ -62,9 +63,10 @@ "num_of_raid_volumes": 2, "volume_type": "gp2", "volume_size": 20, - "volume_iops": 100, + "volume_iops": None, "encrypted": False, "ebs_kms_key_id": None, + "volume_throughput": 125, } DEFAULT_FSX_DICT = { @@ -228,7 +230,7 @@ class DefaultDict(Enum): # ------------------ Default CFN parameters ------------------ # # number of CFN parameters created by the PclusterConfig object. 
-CFN_SIT_CONFIG_NUM_OF_PARAMS = 62 +CFN_SIT_CONFIG_NUM_OF_PARAMS = 63 CFN_HIT_CONFIG_NUM_OF_PARAMS = 53 # CFN parameters created by the pcluster CLI @@ -254,15 +256,16 @@ class DefaultDict(Enum): "EBSSnapshotId": "NONE,NONE,NONE,NONE,NONE", "VolumeType": "gp2,gp2,gp2,gp2,gp2", "VolumeSize": "NONE,NONE,NONE,NONE,NONE", - "VolumeIOPS": "100,100,100,100,100", + "VolumeIOPS": "NONE,NONE,NONE,NONE,NONE", "EBSEncryption": "false,false,false,false,false", "EBSKMSKeyId": "NONE,NONE,NONE,NONE,NONE", "EBSVolumeId": "NONE,NONE,NONE,NONE,NONE", + "VolumeIdThroughput": "125,125,125,125,125", } DEFAULT_EFS_CFN_PARAMS = {"EFSOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE"} -DEFAULT_RAID_CFN_PARAMS = {"RAIDOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE"} +DEFAULT_RAID_CFN_PARAMS = {"RAIDOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE"} DEFAULT_FSX_CFN_PARAMS = { "FSXOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE" @@ -327,14 +330,15 @@ class DefaultDict(Enum): "EBSSnapshotId": "NONE,NONE,NONE,NONE,NONE", "VolumeType": "gp2,gp2,gp2,gp2,gp2", "VolumeSize": "NONE,NONE,NONE,NONE,NONE", - "VolumeIOPS": "100,100,100,100,100", + "VolumeIOPS": "NONE,NONE,NONE,NONE,NONE", "EBSEncryption": "false,false,false,false,false", "EBSKMSKeyId": "NONE,NONE,NONE,NONE,NONE", "EBSVolumeId": "NONE,NONE,NONE,NONE,NONE", + "VolumeThroughput": "125,125,125,125,125", # efs "EFSOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", # raid - "RAIDOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", + "RAIDOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", # fsx "FSXOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", # dcv @@ -397,14 +401,15 @@ class DefaultDict(Enum): "EBSSnapshotId": "NONE,NONE,NONE,NONE,NONE", "VolumeType": "gp2,gp2,gp2,gp2,gp2", "VolumeSize": "NONE,NONE,NONE,NONE,NONE", - "VolumeIOPS": "100,100,100,100,100", + "VolumeIOPS": "NONE,NONE,NONE,NONE,NONE", "EBSEncryption": "false,false,false,false,false", "EBSKMSKeyId": "NONE,NONE,NONE,NONE,NONE", "EBSVolumeId": "NONE,NONE,NONE,NONE,NONE", + "VolumeThroughput": "125,125,125,125,125", # efs "EFSOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", # raid - "RAIDOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", + "RAIDOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", # fsx "FSXOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", # dcv diff --git a/cli/tests/pcluster/config/test_section_cluster.py b/cli/tests/pcluster/config/test_section_cluster.py index b6f293767c..910f690397 100644 --- a/cli/tests/pcluster/config/test_section_cluster.py +++ b/cli/tests/pcluster/config/test_section_cluster.py @@ -1017,7 +1017,7 @@ def test_cluster_section_to_cfn( "SharedDir": "ebs1,NONE,NONE,NONE,NONE", "VolumeType": "io1,gp2,gp2,gp2,gp2", "VolumeSize": "40,NONE,NONE,NONE,NONE", - "VolumeIOPS": "200,100,100,100,100", + "VolumeIOPS": "200,NONE,NONE,NONE,NONE", "EBSEncryption": "true,false,false,false,false", "EBSKMSKeyId": "kms_key,NONE,NONE,NONE,NONE", "EBSVolumeId": "vol-12345678,NONE,NONE,NONE,NONE", @@ -1037,7 +1037,7 @@ def test_cluster_section_to_cfn( "SharedDir": "ebs1,ebs2,NONE,NONE,NONE", "VolumeType": "io1,standard,gp2,gp2,gp2", "VolumeSize": "40,30,NONE,NONE,NONE", - "VolumeIOPS": "200,300,100,100,100", + "VolumeIOPS": "200,300,NONE,NONE,NONE", "EBSEncryption": "true,false,false,false,false", "EBSKMSKeyId": "kms_key,NONE,NONE,NONE,NONE", "EBSVolumeId": "vol-12345678,NONE,NONE,NONE,NONE", @@ 
-1057,7 +1057,7 @@ def test_cluster_section_to_cfn( "SharedDir": "/shared", "VolumeType": "standard,gp2,gp2,gp2,gp2", "VolumeSize": "30,NONE,NONE,NONE,NONE", - "VolumeIOPS": "300,100,100,100,100", + "VolumeIOPS": "300,NONE,NONE,NONE,NONE", "EBSEncryption": "false,false,false,false,false", "EBSKMSKeyId": "NONE,NONE,NONE,NONE,NONE", "EBSVolumeId": "NONE,NONE,NONE,NONE,NONE", @@ -1077,7 +1077,7 @@ def test_cluster_section_to_cfn( "SharedDir": "/work", "VolumeType": "standard,gp2,gp2,gp2,gp2", "VolumeSize": "30,NONE,NONE,NONE,NONE", - "VolumeIOPS": "300,100,100,100,100", + "VolumeIOPS": "300,NONE,NONE,NONE,NONE", "EBSEncryption": "false,false,false,false,false", "EBSKMSKeyId": "NONE,NONE,NONE,NONE,NONE", "EBSVolumeId": "NONE,NONE,NONE,NONE,NONE", @@ -1097,7 +1097,7 @@ def test_cluster_section_to_cfn( "SharedDir": "ebs1,NONE,NONE,NONE,NONE", "VolumeType": "io1,gp2,gp2,gp2,gp2", "VolumeSize": "40,NONE,NONE,NONE,NONE", - "VolumeIOPS": "200,100,100,100,100", + "VolumeIOPS": "200,NONE,NONE,NONE,NONE", "EBSEncryption": "true,false,false,false,false", "EBSKMSKeyId": "kms_key,NONE,NONE,NONE,NONE", "EBSVolumeId": "vol-12345678,NONE,NONE,NONE,NONE", @@ -1125,14 +1125,14 @@ def test_cluster_section_to_cfn( "SharedDir": "ebs1,NONE,NONE,NONE,NONE", "VolumeType": "io1,gp2,gp2,gp2,gp2", "VolumeSize": "40,NONE,NONE,NONE,NONE", - "VolumeIOPS": "200,100,100,100,100", + "VolumeIOPS": "200,NONE,NONE,NONE,NONE", "EBSEncryption": "true,false,false,false,false", "EBSKMSKeyId": "kms_key,NONE,NONE,NONE,NONE", "EBSVolumeId": "vol-12345678,NONE,NONE,NONE,NONE", # efs "EFSOptions": "efs,NONE,generalPurpose,NONE,NONE,false,bursting,Valid,NONE", # raid - "RAIDOptions": "raid,NONE,2,gp2,20,100,false,NONE", + "RAIDOptions": "raid,NONE,2,gp2,20,NONE,false,NONE,125", # fsx "FSXOptions": "fsx,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE," "NONE,NONE", @@ -1191,14 +1191,14 @@ def test_cluster_section_to_cfn( "SharedDir": "ebs1,NONE,NONE,NONE,NONE", "VolumeType": "io1,gp2,gp2,gp2,gp2", "VolumeSize": "40,NONE,NONE,NONE,NONE", - "VolumeIOPS": "200,100,100,100,100", + "VolumeIOPS": "200,NONE,NONE,NONE,NONE", "EBSEncryption": "true,false,false,false,false", "EBSKMSKeyId": "kms_key,NONE,NONE,NONE,NONE", "EBSVolumeId": "vol-12345678,NONE,NONE,NONE,NONE", # efs "EFSOptions": "efs,NONE,generalPurpose,NONE,NONE,false,bursting,Valid,NONE", # raid - "RAIDOptions": "raid,NONE,2,gp2,20,100,false,NONE", + "RAIDOptions": "raid,NONE,2,gp2,20,NONE,false,NONE,125", # fsx "FSXOptions": "fsx,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE," "NONE,NONE", diff --git a/cli/tests/pcluster/config/test_section_ebs.py b/cli/tests/pcluster/config/test_section_ebs.py index 6bc0964839..1977605290 100644 --- a/cli/tests/pcluster/config/test_section_ebs.py +++ b/cli/tests/pcluster/config/test_section_ebs.py @@ -87,10 +87,11 @@ def test_ebs_section_to_file(mocker, section_dict, expected_config_parser_dict, "EBSSnapshotId": "NONE", "VolumeType": "gp2", "VolumeSize": "NONE", - "VolumeIOPS": "100", + "VolumeIOPS": "NONE", "EBSEncryption": "false", "EBSKMSKeyId": "NONE", "EBSVolumeId": "NONE", + "VolumeThroughput": "125", }, ), ( @@ -103,6 +104,7 @@ def test_ebs_section_to_file(mocker, section_dict, expected_config_parser_dict, "encrypted": True, "ebs_kms_key_id": "test", "ebs_volume_id": "test", + "volume_throughput": "125", }, { "SharedDir": "test", @@ -113,6 +115,7 @@ def test_ebs_section_to_file(mocker, section_dict, expected_config_parser_dict, "EBSEncryption": "true", "EBSKMSKeyId": "test", "EBSVolumeId": "test", 
+ "VolumeThroughput": "125", }, ), ], @@ -165,12 +168,17 @@ def test_ebs_section_to_cfn(mocker, section_dict, expected_cfn_params): ("volume_size", "wrong_value", None, "must be an Integer"), ("volume_size", "10", 10, None), ("volume_size", "3", 3, None), - ("volume_iops", None, 100, None), + ("volume_iops", None, None, None), ("volume_iops", "", None, "must be an Integer"), ("volume_iops", "NONE", None, "must be an Integer"), ("volume_iops", "wrong_value", None, "must be an Integer"), ("volume_iops", "10", 10, None), ("volume_iops", "3", 3, None), + ("volume_throughput", None, 125, None), + ("volume_throughput", "", None, "must be an Integer"), + ("volume_throughput", "NONE", None, "must be an Integer"), + ("volume_throughput", "wrong_value", None, "must be an Integer"), + ("volume_throughput", "200", 200, None), ("encrypted", None, False, None), ("encrypted", "", None, "must be a Boolean"), ("encrypted", "NONE", None, "must be a Boolean"), @@ -207,10 +215,11 @@ def test_ebs_param_from_file(mocker, param_key, param_value, expected_value, exp "SharedDir": "ebs1,NONE,NONE,NONE,NONE", "VolumeType": "io1,gp2,gp2,gp2,gp2", "VolumeSize": "40,NONE,NONE,NONE,NONE", - "VolumeIOPS": "200,100,100,100,100", + "VolumeIOPS": "200,NONE,NONE,NONE,NONE", "EBSEncryption": "true,false,false,false,false", "EBSKMSKeyId": "kms_key,NONE,NONE,NONE,NONE", "EBSVolumeId": "vol-12345678,NONE,NONE,NONE,NONE", + "VolumeIOPS": "200,NONE,NONE,NONE,NONE", }, ), ), @@ -223,10 +232,27 @@ def test_ebs_param_from_file(mocker, param_key, param_value, expected_value, exp "SharedDir": "ebs2,NONE,NONE,NONE,NONE", "VolumeType": "standard,gp2,gp2,gp2,gp2", "VolumeSize": "30,NONE,NONE,NONE,NONE", - "VolumeIOPS": "300,100,100,100,100", + "VolumeIOPS": "300,NONE,NONE,NONE,NONE", + "EBSEncryption": "false,false,false,false,false", + "EBSKMSKeyId": "NONE,NONE,NONE,NONE,NONE", + "EBSVolumeId": "NONE,NONE,NONE,NONE,NONE", + }, + ), + ), + ( + "ebs3", + utils.merge_dicts( + DefaultCfnParams["cluster_sit"].value, + { + "NumberOfEBSVol": "1", + "SharedDir": "ebs3,NONE,NONE,NONE,NONE", + "VolumeType": "gp3,gp2,gp2,gp2,gp2", + "VolumeSize": "30,NONE,NONE,NONE,NONE", + "VolumeIOPS": "3000,NONE,NONE,NONE,NONE", "EBSEncryption": "false,false,false,false,false", "EBSKMSKeyId": "NONE,NONE,NONE,NONE,NONE", "EBSVolumeId": "NONE,NONE,NONE,NONE,NONE", + "VolumeThroughput": "150,125,125,125,125", }, ), ), diff --git a/cli/tests/pcluster/config/test_section_ebs/test_ebs_from_file_to_cfn/pcluster.config.ini b/cli/tests/pcluster/config/test_section_ebs/test_ebs_from_file_to_cfn/pcluster.config.ini index 761f3ca45b..37217fe2f3 100644 --- a/cli/tests/pcluster/config/test_section_ebs/test_ebs_from_file_to_cfn/pcluster.config.ini +++ b/cli/tests/pcluster/config/test_section_ebs/test_ebs_from_file_to_cfn/pcluster.config.ini @@ -27,3 +27,9 @@ volume_size = 30 volume_iops = 300 encrypted = false +[ebs ebs3] +shared_dir = ebs3 +volume_type = gp3 +volume_size = 30 +volume_iops = 3000 +volume_throughput =150 diff --git a/cli/tests/pcluster/config/test_section_raid.py b/cli/tests/pcluster/config/test_section_raid.py index 0f56accaa6..e483ec205f 100644 --- a/cli/tests/pcluster/config/test_section_raid.py +++ b/cli/tests/pcluster/config/test_section_raid.py @@ -20,19 +20,20 @@ [ (DefaultCfnParams["raid"].value, DefaultDict["raid"].value), ({}, DefaultDict["raid"].value), - ({"RAIDOptions": "NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE"}, DefaultDict["raid"].value), - ({"RAIDOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE"}, DefaultDict["raid"].value), + 
({"RAIDOptions": "NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE"}, DefaultDict["raid"].value), + ({"RAIDOptions": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE"}, DefaultDict["raid"].value), ( - {"RAIDOptions": "test,NONE,NONE,NONE,NONE,NONE,NONE,NONE"}, + {"RAIDOptions": "test,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE"}, { "shared_dir": "test", "raid_type": None, "num_of_raid_volumes": 2, "volume_type": "gp2", "volume_size": 20, - "volume_iops": 100, + "volume_iops": None, "encrypted": False, "ebs_kms_key_id": None, + "volume_throughput": 125, }, ), ( @@ -79,7 +80,7 @@ def test_raid_section_from_file(mocker, config_parser_dict, expected_dict_params # default ({}, {"raid default": {}}, None), # default values - ({"volume_iops": 100}, {"raid default": {"volume_iops": "100"}}, "No section.*"), + ({"volume_throughput": 125}, {"raid default": {"volume_throughput": "125"}}, "No section.*"), ({"encrypted": False}, {"raid default": {"encrypted": "false"}}, "No section.*"), # other values ({"volume_iops": 120}, {"raid default": {"volume_iops": "120"}}, None), @@ -144,7 +145,7 @@ def test_raid_section_to_cfn(mocker, section_dict, expected_cfn_params): ("volume_size", "wrong_value", None, "must be an Integer"), ("volume_size", "10", 10, None), ("volume_size", "3", 3, None), - ("volume_iops", None, 100, None), + ("volume_iops", None, None, None), ("volume_iops", "", None, "must be an Integer"), ("volume_iops", "NONE", None, "must be an Integer"), ("volume_iops", "wrong_value", None, "must be an Integer"), @@ -160,6 +161,9 @@ def test_raid_section_to_cfn(mocker, section_dict, expected_cfn_params): ("ebs_kms_key_id", "fake_value", "fake_value", None), ("ebs_kms_key_id", "test", "test", None), ("ebs_kms_key_id", "NONE", "NONE", None), # NONE is evaluated as a valid kms id + ("volume_throughput", "NONE", None, "must be an Integer"), + ("volume_throughput", "wrong_value", None, "must be an Integer"), + ("volume_throughput", "150", 150, None), ], ) def test_raid_param_from_file(mocker, param_key, param_value, expected_value, expected_message): diff --git a/cli/tests/pcluster/config/test_validators.py b/cli/tests/pcluster/config/test_validators.py index ff3736e47f..e87b5b8ef3 100644 --- a/cli/tests/pcluster/config/test_validators.py +++ b/cli/tests/pcluster/config/test_validators.py @@ -694,7 +694,7 @@ def test_efs_validator(mocker, section_dict, expected_message): {"volume_type": "io1", "volume_size": 20, "volume_iops": 64001}, "IOPS rate must be between 100 and 64000 when provisioning io1 volumes.", ), - ({"volume_type": "io1", "volume_size": 20, "volume_iops": 1001}, "IOPS to volume size ratio of .* is too hig"), + ({"volume_type": "io1", "volume_size": 20, "volume_iops": 1001}, "IOPS to volume size ratio of .* is too high"), ({"volume_type": "io2", "volume_size": 20, "volume_iops": 120}, None), ( {"volume_type": "io2", "volume_size": 20, "volume_iops": 90}, @@ -704,7 +704,23 @@ def test_efs_validator(mocker, section_dict, expected_message): {"volume_type": "io2", "volume_size": 20, "volume_iops": 64001}, "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", ), - ({"volume_type": "io2", "volume_size": 20, "volume_iops": 10001}, "IOPS to volume size ratio of .* is too hig"), + ( + {"volume_type": "io2", "volume_size": 20, "volume_iops": 10001}, + "IOPS to volume size ratio of .* is too high", + ), + ({"volume_type": "gp3", "volume_size": 20, "volume_iops": 3000}, None), + ( + {"volume_type": "gp3", "volume_size": 20, "volume_iops": 2900}, + "IOPS rate must be between 3000 and 
16000 when provisioning gp3 volumes.", + ), + ( + {"volume_type": "gp3", "volume_size": 20, "volume_iops": 16001}, + "IOPS rate must be between 3000 and 16000 when provisioning gp3 volumes.", + ), + ( + {"volume_type": "gp3", "volume_size": 20, "volume_iops": 10001}, + "IOPS to volume size ratio of .* is too high", + ), ], ) def test_raid_validators(mocker, section_dict, expected_message): @@ -2437,6 +2453,9 @@ def test_fsx_ignored_parameters_validator(mocker, section_dict, expected_error): ({"volume_type": "gp2", "volume_size": 15}, None), ({"volume_type": "gp2", "volume_size": 0}, "The size of gp2 volumes must be at least 1 GiB"), ({"volume_type": "gp2", "volume_size": 16385}, "The size of gp2 volumes can not exceed 16384 GiB"), + ({"volume_type": "gp3", "volume_size": 15}, None), + ({"volume_type": "gp3", "volume_size": 0}, "The size of gp3 volumes must be at least 1 GiB"), + ({"volume_type": "gp3", "volume_size": 16385}, "The size of gp3 volumes can not exceed 16384 GiB"), ({"volume_type": "st1", "volume_size": 500}, None), ({"volume_type": "st1", "volume_size": 20}, "The size of st1 volumes must be at least 500 GiB"), ({"volume_type": "st1", "volume_size": 16385}, "The size of st1 volumes can not exceed 16384 GiB"), @@ -2470,7 +2489,7 @@ def test_ebs_allowed_values_all_have_volume_size_bounds(): {"volume_type": "io1", "volume_size": 20, "volume_iops": 64001}, "IOPS rate must be between 100 and 64000 when provisioning io1 volumes.", ), - ({"volume_type": "io1", "volume_size": 20, "volume_iops": 1001}, "IOPS to volume size ratio of .* is too hig"), + ({"volume_type": "io1", "volume_size": 20, "volume_iops": 1001}, "IOPS to volume size ratio of .* is too high"), ({"volume_type": "io2", "volume_size": 20, "volume_iops": 120}, None), ( {"volume_type": "io2", "volume_size": 20, "volume_iops": 90}, @@ -2480,7 +2499,23 @@ def test_ebs_allowed_values_all_have_volume_size_bounds(): {"volume_type": "io2", "volume_size": 20, "volume_iops": 64001}, "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", ), - ({"volume_type": "io2", "volume_size": 20, "volume_iops": 10001}, "IOPS to volume size ratio of .* is too hig"), + ( + {"volume_type": "io2", "volume_size": 20, "volume_iops": 10001}, + "IOPS to volume size ratio of .* is too high", + ), + ({"volume_type": "gp3", "volume_size": 20, "volume_iops": 3000}, None), + ( + {"volume_type": "gp3", "volume_size": 20, "volume_iops": 2900}, + "IOPS rate must be between 3000 and 16000 when provisioning gp3 volumes.", + ), + ( + {"volume_type": "gp3", "volume_size": 20, "volume_iops": 16001}, + "IOPS rate must be between 3000 and 16000 when provisioning gp3 volumes.", + ), + ( + {"volume_type": "gp3", "volume_size": 20, "volume_iops": 10001}, + "IOPS to volume size ratio of .* is too high", + ), ], ) def test_ebs_volume_iops_validator(mocker, section_dict, expected_message): @@ -2711,3 +2746,28 @@ def test_efa_os_arch_validator(mocker, cluster_dict, architecture, expected_erro assert_that(errors[0]).matches(expected_error) else: assert_that(errors).is_empty() + + +@pytest.mark.parametrize( + "section_dict, expected_message", + [ + ({"volume_type": "gp3", "volume_throughput": 125}, None), + ( + {"volume_type": "gp3", "volume_throughput": 100}, + "Throughput must be between 125 MB/s and 1000 MB/s when provisioning gp3 volumes.", + ), + ( + {"volume_type": "gp3", "volume_throughput": 1001}, + "Throughput must be between 125 MB/s and 1000 MB/s when provisioning gp3 volumes.", + ), + ({"volume_type": "gp3", "volume_throughput": 125, 
"volume_iops": 3000}, None), + ( + {"volume_type": "gp3", "volume_throughput": 760, "volume_iops": 3000}, + "Throughput to IOPS ratio of .* is too high", + ), + ({"volume_type": "gp3", "volume_throughput": 760, "volume_iops": 10000}, None), + ], +) +def test_ebs_volume_throughput_validator(mocker, section_dict, expected_message): + config_parser_dict = {"cluster default": {"ebs_settings": "default"}, "ebs default": section_dict} + utils.assert_param_validator(mocker, config_parser_dict, expected_message) diff --git a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/expected_output.ini b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/expected_output.ini index 4aa0eafb7c..c3c7369bda 100644 --- a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/expected_output.ini +++ b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/expected_output.ini @@ -21,7 +21,9 @@ fsx_fs_id = fs-0aaae053900f84047 [ebs settings1] shared_dir = sharedebs -volume_size = 50 +volume_type = gp3 +volume_size = 20 +volume_iops = 3000 [ebs settings2] shared_dir = sharedebs2 diff --git a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/pcluster.config.ini b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/pcluster.config.ini index 03342acd51..1e98f059ef 100644 --- a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/pcluster.config.ini +++ b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_sit_full/pcluster.config.ini @@ -61,8 +61,7 @@ fsx_fs_id = fs-0aaae053900f84047 [ebs settings1] shared_dir = sharedebs -volume_type = gp2 -volume_size = 50 +volume_type = gp3 [ebs settings2] shared_dir = sharedebs2 diff --git a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/expected_output.ini b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/expected_output.ini index 1baaa68787..a4e5b5a8fe 100644 --- a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/expected_output.ini +++ b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/expected_output.ini @@ -25,7 +25,9 @@ volume_size = 50 [ebs settings2] shared_dir = sharedebs2 +volume_type = gp3 volume_size = 10 +volume_iops = 3500 [raid settings1] shared_dir = /raid_dir diff --git a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/pcluster.config.ini b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/pcluster.config.ini index c6c1e5b638..904cee0961 100644 --- a/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/pcluster.config.ini +++ b/cli/tests/pcluster_config/test_pcluster_config_convert/test_slurm_unrelated_sections/pcluster.config.ini @@ -62,13 +62,13 @@ fsx_fs_id = fs-0aaae053900f84047 [ebs settings1] shared_dir = sharedebs -volume_type = gp2 volume_size = 50 [ebs settings2] shared_dir = sharedebs2 -volume_type = gp2 +volume_type = gp3 volume_size = 10 +volume_iops = 3500 [raid settings1] shared_dir = /raid_dir diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 3ecbb288e4..9856065201 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -67,8 +67,8 @@ "Description": "Comma delimited list of type of volume to create either new or from 
snapshot", "Type": "String", "Default": "gp2,gp2,gp2,gp2,gp2", - "ConstraintDescription": "must be a supported volume type: standard, io1, io2, gp2, st1, sc1", - "AllowedPattern": "^(NONE|standard|io1|io2|gp2|st1|sc1)((,|, )(NONE|standard|io1|io2|gp2|st1|sc1)){4}$" + "ConstraintDescription": "must be a supported volume type: standard, io1, io2, gp2, gp3, st1, sc1", + "AllowedPattern": "^(NONE|standard|io1|io2|gp2|gp3|st1|sc1)((,|, )(NONE|standard|io1|io2|gp2|gp3|st1|sc1)){4}$" }, "MasterSubnetId": { "Description": "ID of the Subnet you want to provision the head node into", @@ -121,9 +121,14 @@ ] }, "VolumeIOPS": { - "Description": "Comma delimited list of number of IOPS for volume type io1 and io2. Not used for other volume types.", + "Description": "Comma delimited list of number of IOPS for volume type io1, io2 and gp3. Not used for other volume types.", "Type": "String", - "Default": "100,100,100,100,100" + "Default": "NONE,NONE,NONE,NONE,NONE" + }, + "VolumeThroughput": { + "Description": "Comma delimited list of number of Throughtput for volume type gp3. Not used for other volume types.", + "Type": "String", + "Default": "125,125,125,125,125" }, "PreInstallScript": { "Description": "Preinstall script URL. This is run before any host configuration.", @@ -320,10 +325,10 @@ "Type": "String" }, "RAIDOptions": { - "Description": "Comma Separated List of RAID related options, 8 parameters in total, [shared_dir,raid_type,num_of_raid_volumes,volume_type,volume_size,volume_iops,encrypted,ebs_kms_key_id]", + "Description": "Comma Separated List of RAID related options, 9 parameters in total, [shared_dir,raid_type,num_of_raid_volumes,volume_type,volume_size,volume_iops,encrypted,ebs_kms_key_id,volume_throughput]", "Type": "String", - "Default": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", - "AllowedPattern": "^(NONE|.+)(,|, )(NONE|\\d)(,|, )(NONE|\\d)(,|, )(NONE|standard|io1|io2|gp2|st1|sc1)(,|, )(NONE|\\d+)(,|, )(NONE|\\d+)(,|, )(NONE|true|false)(,|, )(NONE|.+)$" + "Default": "NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE", + "AllowedPattern": "^(NONE|.+)(,|, )(NONE|\\d)(,|, )(NONE|\\d)(,|, )(NONE|standard|io1|io2|gp2|gp3|st1|sc1)(,|, )(NONE|\\d+)(,|, )(NONE|\\d+)(,|, )(NONE|true|false)(,|, )(NONE|.+)(,|, )(NONE|\\d+)$" }, "NumberOfEBSVol": { "Description": "Number of EBS Volumes the user requested, up to 5", @@ -2674,6 +2679,9 @@ "VolumeIOPS": { "Ref": "VolumeIOPS" }, + "VolumeThroughput": { + "Ref": "VolumeThroughput" + }, "EBSEncryption": { "Ref": "EBSEncryption" }, diff --git a/cloudformation/cw-dashboard-substack.cfn.yaml b/cloudformation/cw-dashboard-substack.cfn.yaml index 04d7a9309f..fcaf288b50 100644 --- a/cloudformation/cw-dashboard-substack.cfn.yaml +++ b/cloudformation/cw-dashboard-substack.cfn.yaml @@ -144,8 +144,8 @@ Resources: {%- endfor %} {#- Conditional EBS metrics #} - {%- set ebs_metrics_conditions = [{'metric': 'VolumeConsumedReadWriteOps', 'supported_vol_types': ["io1", "io2"], 'extra_params': ['"title":"Consumed Read/Write Ops"']}, - {'metric': 'VolumeThroughputPercentage', 'supported_vol_types': ["io1", "io2"], 'extra_params': ['"title":"Throughput Percentage"']}, + {%- set ebs_metrics_conditions = [{'metric': 'VolumeConsumedReadWriteOps', 'supported_vol_types': ["io1", "io2", "gp3"], 'extra_params': ['"title":"Consumed Read/Write Ops"']}, + {'metric': 'VolumeThroughputPercentage', 'supported_vol_types': ["io1", "io2", "gp3"], 'extra_params': ['"title":"Throughput Percentage"']}, {'metric': 'BurstBalance', 'supported_vol_types': ["gp2", "st1", "sc1"], 'extra_params': 
['"title":"Burst Balance"']}] %} {%- for metric_condition_params in ebs_metrics_conditions %} diff --git a/cloudformation/ebs-substack.cfn.json b/cloudformation/ebs-substack.cfn.json index f50663ebc0..e2f80ff92e 100644 --- a/cloudformation/ebs-substack.cfn.json +++ b/cloudformation/ebs-substack.cfn.json @@ -121,6 +121,19 @@ }, "Vol1_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "0", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -149,6 +162,19 @@ } ] }, + "Vol1_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "0", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, "Vol1_UseEBSSnapshot": { "Fn::Not": [ { @@ -276,6 +302,19 @@ }, "Vol2_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "1", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -304,6 +343,19 @@ } ] }, + "Vol2_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "1", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, "Vol2_UseEBSSnapshot": { "Fn::Not": [ { @@ -431,6 +483,19 @@ }, "Vol3_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "2", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -459,6 +524,19 @@ } ] }, + "Vol3_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "2", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, "Vol3_UseEBSSnapshot": { "Fn::Not": [ { @@ -586,6 +664,19 @@ }, "Vol4_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -614,6 +705,19 @@ } ] }, + "Vol4_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, "Vol4_UseEBSSnapshot": { "Fn::Not": [ { @@ -741,6 +845,19 @@ }, "Vol5_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "4", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -769,6 +886,19 @@ } ] }, + "Vol5_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "4", + { + "Ref": "VolumeType" + } + ] + }, + "gp3" + ] + }, "Vol5_UseEBSSnapshot": { "Fn::Not": [ { @@ -1157,7 +1287,11 @@ "Type": "Number" }, "VolumeIOPS": { - "Description": "Number of IOPS for volume type io1 and io2. Not used for other volume types.", + "Description": "Number of IOPS for volume type io1, io2 and gp3. Not used for other volume types.", + "Type": "CommaDelimitedList" + }, + "VolumeThroughput": { + "Description": "Throughput for volume type gp3. 
Not used for other volume types.", "Type": "CommaDelimitedList" }, "VolumeSize": { @@ -1208,6 +1342,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol1_UseEBSThroughput", + { + "Fn::Select": [ + "0", + { + "Ref": "VolumeThroughput" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol1_UseEBSKMSKey", @@ -1309,6 +1459,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol2_UseEBSThroughput", + { + "Fn::Select": [ + "1", + { + "Ref": "VolumeThroughput" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol2_UseEBSKMSKey", @@ -1410,6 +1576,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol3_UseEBSThroughput", + { + "Fn::Select": [ + "2", + { + "Ref": "VolumeThroughput" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol3_UseEBSKMSKey", @@ -1511,6 +1693,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol4_UseEBSThroughput", + { + "Fn::Select": [ + "3", + { + "Ref": "VolumeThroughput" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol4_UseEBSKMSKey", @@ -1612,6 +1810,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol5_UseEBSThroughput", + { + "Fn::Select": [ + "4", + { + "Ref": "VolumeThroughput" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol5_UseEBSKMSKey", diff --git a/cloudformation/raid-substack.cfn.json b/cloudformation/raid-substack.cfn.json index 41311f3578..8fc1286b2d 100644 --- a/cloudformation/raid-substack.cfn.json +++ b/cloudformation/raid-substack.cfn.json @@ -152,6 +152,19 @@ }, "Vol1_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -180,6 +193,19 @@ } ] }, + "Vol1_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, "Vol1_UseVolumeSize": { "Fn::Not": [ { @@ -253,6 +279,19 @@ }, "Vol2_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -281,6 +320,19 @@ } ] }, + "Vol2_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, "Vol2_UseVolumeSize": { "Fn::Not": [ { @@ -354,6 +406,19 @@ }, "Vol3_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -382,6 +447,19 @@ } ] }, + "Vol3_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, "Vol3_UseVolumeSize": { "Fn::Not": [ { @@ -455,6 +533,19 @@ }, "Vol4_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -483,6 +574,19 @@ } ] }, + "Vol4_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, "Vol4_UseVolumeSize": { "Fn::Not": [ { @@ -556,6 +660,19 @@ }, "Vol5_UseEBSPIOPS": { "Fn::Or": [ + { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, { "Fn::Equals": [ { @@ -584,6 +701,19 @@ } ] }, + "Vol5_UseEBSThroughput": { + "Fn::Equals": [ + { + "Fn::Select": [ + "3", + { + "Ref": "RAIDOptions" + } + ] + }, + "gp3" + ] + }, "Vol5_UseVolumeSize": { "Fn::Not": [ { @@ -728,8 +858,8 @@ "Description": "Availability Zone the cluster will 
launch into. THIS IS REQUIRED", "Type": "AWS::EC2::AvailabilityZone::Name" }, - "RAIDOptions": { - "Description": "Comma separated list of RAID related options, 8 parameters in total, [0 shared_dir,1 raid_type,2 num_of_vols,3 vol_type,4 vol_size,5 vol_IOPS,6 encrypted, 7 ebs_kms_key]", + "RAIDOptions":{ + "Description": "Comma separated list of RAID related options, 9 parameters in total, [0 shared_dir,1 raid_type,2 num_of_vols,3 vol_type,4 vol_size,5 vol_IOPS,6 encrypted, 7 ebs_kms_key, 8 volume_throughput]", "Type": "CommaDelimitedList" } }, @@ -772,6 +902,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol1_UseEBSThroughput", + { + "Fn::Select": [ + "8", + { + "Ref": "RAIDOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol1_UseEBSKMSKey", @@ -857,6 +1003,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol2_UseEBSThroughput", + { + "Fn::Select": [ + "8", + { + "Ref": "RAIDOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol2_UseEBSKMSKey", @@ -942,6 +1104,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol3_UseEBSThroughput", + { + "Fn::Select": [ + "8", + { + "Ref": "RAIDOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol3_UseEBSKMSKey", @@ -1027,6 +1205,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol4_UseEBSThroughput", + { + "Fn::Select": [ + "8", + { + "Ref": "RAIDOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol4_UseEBSKMSKey", @@ -1112,6 +1306,22 @@ } ] }, + "Throughput": { + "Fn::If": [ + "Vol5_UseEBSThroughput", + { + "Fn::Select": [ + "8", + { + "Ref": "RAIDOptions" + } + ] + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "KmsKeyId": { "Fn::If": [ "Vol5_UseEBSKMSKey", diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index 48a5cf1553..1d26eaa5af 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -115,7 +115,7 @@ def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory, regio assert_that(volume[0]).is_equal_to(volume_type) # test different iops # only the iops of io1 and io2 can be configured by us - if volume_type == "io1" or volume_type == "io2": + if volume_type in ["io1", "io2", "gp3"]: volume_iops = cluster.config.get("ebs ebs{0}".format(i + 1), "volume_iops") assert_that(volume[1]).is_equal_to(int(volume_iops)) diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini index c2ff5023d7..cd6ac59241 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini @@ -28,10 +28,11 @@ use_public_ips = false [ebs ebs1] shared_dir = {{ mount_dirs[0] }} -volume_type = io1 +volume_type = gp3 volume_size = {{ volume_sizes[0] }} -volume_iops = 100 +volume_iops = 3200 encrypted = true +volume_throughput = 130 [ebs ebs2] shared_dir = {{ mount_dirs[1] }} From a99ad132ca2cf3d7b19a89aa7eb01a06e2236cfb Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Thu, 17 Dec 2020 09:53:39 +0100 Subject: [PATCH 50/66] Enable FSx Lustre tests in GovCloud Signed-off-by: Francesco De Martino --- tests/integration-tests/configs/common/common.yaml | 6 +++++- .../test_fsx_lustre/test_fsx_lustre/pcluster.config.ini | 4 ++-- 2 files 
changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index d1795abc65..ba0afe616b 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -389,6 +389,10 @@ storage: instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux"] schedulers: ["slurm"] + - regions: ["us-gov-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["alinux2"] + schedulers: ["slurm"] test_fsx_lustre.py::test_fsx_lustre_configuration_options: dimensions: - regions: ["us-east-2"] @@ -557,4 +561,4 @@ resource_bucket: - regions: ["ap-southeast-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2"] - schedulers: ["slurm", "awsbatch"] \ No newline at end of file + schedulers: ["slurm", "awsbatch"] diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini index 04a3a17142..c92c5dd9af 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini @@ -38,8 +38,8 @@ shared_dir = {{ mount_dir }} storage_capacity = {{ storage_capacity }} import_path = s3://{{ bucket_name }} export_path = s3://{{ bucket_name }}/export_dir -{% if region.startswith("cn-") %} -# the only deployment_type supported in China regions is PERSISTENT_1 +{% if region.startswith(("cn-", "us-gov-")) %} +# SCRATCH_1 not available in China/GovCloud regions deployment_type = PERSISTENT_1 per_unit_storage_throughput = 200 {% endif %} From 9db9ac23c533e44b4245a3c78b202767920a52ae Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Thu, 17 Dec 2020 17:55:55 +0100 Subject: [PATCH 51/66] Fix formatting in CFN template Signed-off-by: Francesco De Martino --- cloudformation/raid-substack.cfn.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudformation/raid-substack.cfn.json b/cloudformation/raid-substack.cfn.json index 8fc1286b2d..148eca9629 100644 --- a/cloudformation/raid-substack.cfn.json +++ b/cloudformation/raid-substack.cfn.json @@ -858,7 +858,7 @@ "Description": "Availability Zone the cluster will launch into. 
THIS IS REQUIRED", "Type": "AWS::EC2::AvailabilityZone::Name" }, - "RAIDOptions":{ + "RAIDOptions": { "Description": "Comma separated list of RAID related options, 9 parameters in total, [0 shared_dir,1 raid_type,2 num_of_vols,3 vol_type,4 vol_size,5 vol_IOPS,6 encrypted, 7 ebs_kms_key, 8 volume_throughput]", "Type": "CommaDelimitedList" } From df73a350cf242fda36d84ce45141975e8c701b92 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Thu, 17 Dec 2020 19:10:49 +0100 Subject: [PATCH 52/66] Restore scaling tests in develop Signed-off-by: Francesco De Martino --- tests/integration-tests/configs/develop.yaml | 32 ++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml index 0eb3fc82b1..77a9dd7f4e 100644 --- a/tests/integration-tests/configs/develop.yaml +++ b/tests/integration-tests/configs/develop.yaml @@ -26,6 +26,38 @@ test-suites: schedulers: ["{{ scheduler }}"] {%- endfor %} scaling: + test_scaling.py::test_hit_scaling: + dimensions: + - regions: ["us-west-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_ONE_PER_DISTRO }} + schedulers: ["slurm"] + test_scaling.py::test_nodewatcher_terminates_failing_node: + dimensions: + - regions: ["sa-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_ONE_PER_DISTRO }} + schedulers: ["sge", "torque"] + test_mpi.py::test_mpi: # TODO: move outside of the scaling dir + dimensions: + - regions: ["ap-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm", "sge"] + - regions: ["us-east-1"] + instances: {{ common.INSTANCES_DEFAULT_ARM }} + oss: {{ common.OSS_COMMERCIAL_ARM }} + schedulers: ["slurm", "sge"] + test_mpi.py::test_mpi_ssh: + dimensions: + - regions: ["eu-north-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["slurm"] + - regions: ["ca-central-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: {{ common.OSS_COMMERCIAL_X86 }} + schedulers: ["sge"] test_scaling.py::test_multiple_jobs_submission: dimensions: - regions: {{ common.REGIONS_COMMERCIAL }} From e90fad56b14a2ea0a450959fda93619e2706fb3e Mon Sep 17 00:00:00 2001 From: ddeidda Date: Fri, 18 Dec 2020 10:45:21 +0100 Subject: [PATCH 53/66] Relax EFA interface check in integration tests The final number returned from `lspci -n` can be different from 0. 
Signed-off-by: ddeidda --- tests/integration-tests/tests/efa/test_efa.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/efa/test_efa.py b/tests/integration-tests/tests/efa/test_efa.py index bece1e555b..5de729597d 100644 --- a/tests/integration-tests/tests/efa/test_efa.py +++ b/tests/integration-tests/tests/efa/test_efa.py @@ -111,13 +111,13 @@ def _test_efa_installation(scheduler_commands, remote_command_executor, efa_inst # Check if EFA interface is on compute node result = remote_command_executor.run_remote_command("cat /shared/lspci.out") if efa_installed: - assert_that(result.stdout).contains("1d0f:efa0") + assert_that(result.stdout).contains("1d0f:efa") else: - assert_that(result.stdout).does_not_contain("1d0f:efa0") + assert_that(result.stdout).does_not_contain("1d0f:efa") # Check EFA interface not present on head node result = remote_command_executor.run_remote_command("lspci -n") - assert_that(result.stdout).does_not_contain("1d0f:efa0") + assert_that(result.stdout).does_not_contain("1d0f:efa") def _test_osu_benchmarks( From ddf0a8d049a0fabacb759ec5ff7f5d8664c172a1 Mon Sep 17 00:00:00 2001 From: ddeidda Date: Fri, 18 Dec 2020 12:30:32 +0100 Subject: [PATCH 54/66] Re-enable master instance type on EFA tests P4d is now supported also as head node. Signed-off-by: ddeidda --- .../tests/efa/test_efa/test_hit_efa/pcluster.config.ini | 2 +- .../tests/efa/test_efa/test_sit_efa/pcluster.config.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/efa/test_efa/test_hit_efa/pcluster.config.ini b/tests/integration-tests/tests/efa/test_efa/test_hit_efa/pcluster.config.ini index e856c358cd..bdba3da944 100644 --- a/tests/integration-tests/tests/efa/test_efa/test_hit_efa/pcluster.config.ini +++ b/tests/integration-tests/tests/efa/test_efa/test_hit_efa/pcluster.config.ini @@ -9,7 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} -master_instance_type = c5.xlarge +master_instance_type = {{ instance }} queue_settings = efa-enabled,efa-disabled [queue efa-enabled] diff --git a/tests/integration-tests/tests/efa/test_efa/test_sit_efa/pcluster.config.ini b/tests/integration-tests/tests/efa/test_efa/test_sit_efa/pcluster.config.ini index 865603075f..7123ce919a 100644 --- a/tests/integration-tests/tests/efa/test_efa/test_sit_efa/pcluster.config.ini +++ b/tests/integration-tests/tests/efa/test_efa/test_sit_efa/pcluster.config.ini @@ -9,7 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} -master_instance_type = c5.xlarge +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} initial_queue_size = 2 maintain_initial_size = true From c0845f0344b93d6d69db822ded3bbba2363340fe Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 18 Dec 2020 14:26:11 +0100 Subject: [PATCH 55/66] Enable EFA tests for c6gn.16xlarge Signed-off-by: Francesco De Martino --- .../configs/common/common.yaml | 4 ++++ tests/integration-tests/tests/efa/test_efa.py | 24 ++++++++++--------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index ba0afe616b..9273480ab4 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -207,6 +207,10 @@ efa: instances: ["p4d.24xlarge"] oss: ["alinux2", 
"ubuntu1604", "centos8"] schedulers: ["slurm"] + - regions: ["us-west-2"] + instances: ["c6gn.16xlarge"] + oss: ["alinux2", "ubuntu1604"] + schedulers: ["slurm"] test_efa.py::test_sit_efa: dimensions: - regions: ["us-east-1"] diff --git a/tests/integration-tests/tests/efa/test_efa.py b/tests/integration-tests/tests/efa/test_efa.py index 5de729597d..32ecec533d 100644 --- a/tests/integration-tests/tests/efa/test_efa.py +++ b/tests/integration-tests/tests/efa/test_efa.py @@ -29,7 +29,7 @@ # Slurm test is to verify EFA works correctly when using the SIT model in the config file @pytest.mark.schedulers(["sge", "slurm"]) @pytest.mark.usefixtures("os") -def test_sit_efa(region, scheduler, instance, pcluster_config_reader, clusters_factory, test_datadir): +def test_sit_efa(region, scheduler, instance, pcluster_config_reader, clusters_factory, test_datadir, architecture): """ Test all EFA Features. @@ -46,7 +46,8 @@ def test_sit_efa(region, scheduler, instance, pcluster_config_reader, clusters_f _test_mpi(remote_command_executor, slots_per_instance, scheduler) logging.info("Running on Instances: {0}".format(get_compute_nodes_instance_ids(cluster.cfn_name, region))) _test_osu_benchmarks("openmpi", remote_command_executor, scheduler_commands, test_datadir, slots_per_instance) - _test_osu_benchmarks("intelmpi", remote_command_executor, scheduler_commands, test_datadir, slots_per_instance) + if architecture == "x86_64": + _test_osu_benchmarks("intelmpi", remote_command_executor, scheduler_commands, test_datadir, slots_per_instance) _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor) assert_no_errors_in_logs(remote_command_executor, scheduler) @@ -57,7 +58,7 @@ def test_sit_efa(region, scheduler, instance, pcluster_config_reader, clusters_f @pytest.mark.oss(["alinux2"]) @pytest.mark.schedulers(["slurm"]) @pytest.mark.usefixtures("os") -def test_hit_efa(region, scheduler, instance, pcluster_config_reader, clusters_factory, test_datadir): +def test_hit_efa(region, scheduler, instance, pcluster_config_reader, clusters_factory, test_datadir, architecture): """ Test all EFA Features. @@ -82,14 +83,15 @@ def test_hit_efa(region, scheduler, instance, pcluster_config_reader, clusters_f slots_per_instance, partition="efa-enabled", ) - _test_osu_benchmarks( - "intelmpi", - remote_command_executor, - scheduler_commands, - test_datadir, - slots_per_instance, - partition="efa-enabled", - ) + if architecture == "x86_64": + _test_osu_benchmarks( + "intelmpi", + remote_command_executor, + scheduler_commands, + test_datadir, + slots_per_instance, + partition="efa-enabled", + ) _test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor, partition="efa-enabled") assert_no_errors_in_logs(remote_command_executor, scheduler) From aae4aa5b2c1e19e8313a0bd6b76994b133a610d5 Mon Sep 17 00:00:00 2001 From: ddeidda Date: Fri, 18 Dec 2020 16:38:30 +0100 Subject: [PATCH 56/66] Prevent configuration of non NVIDIA GPUs in compute resources GPUs from manufacturers different from NVIDIA (ex. AMD) are currently not supported in ParallelCluster. With this patch we introduce a warning message that will be printed when GPUs from a manufacturer different from NVIDIA are detected, and we prevent them from being set in compute resurces. 
Signed-off-by: ddeidda --- cli/src/pcluster/utils.py | 18 +++++++++++++++--- cli/tests/pcluster/test_utils.py | 28 +++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/cli/src/pcluster/utils.py b/cli/src/pcluster/utils.py index 00844e2fb2..d7844d509f 100644 --- a/cli/src/pcluster/utils.py +++ b/cli/src/pcluster/utils.py @@ -1270,9 +1270,21 @@ def init_from_instance_type(instance_type, exit_on_error=True): def gpu_count(self): """Return the number of GPUs for the instance.""" gpu_info = self.instance_type_data.get("GpuInfo", None) - # Currently adding up all gpus. To be reviewed if the case of heterogeneous GPUs arises. - gpus = sum([gpus.get("Count") for gpus in gpu_info.get("Gpus")]) if gpu_info else 0 - return gpus + + gpu_count = 0 + if gpu_info: + for gpus in gpu_info.get("Gpus", []): + gpu_manufacturer = gpus.get("Manufacturer", "") + if gpu_manufacturer.upper() == "NVIDIA": + gpu_count += gpus.get("Count", 0) + else: + warn( + "ParallelCluster currently does not offer native support for '{0}' GPUs. " + "Please make sure to use a custom AMI with the appropriate drivers in order to leverage " + "GPUs functionalities".format(gpu_manufacturer) + ) + + return gpu_count def max_network_interface_count(self): """Max number of NICs for the instance.""" diff --git a/cli/tests/pcluster/test_utils.py b/cli/tests/pcluster/test_utils.py index 6195489b77..ab0081633a 100644 --- a/cli/tests/pcluster/test_utils.py +++ b/cli/tests/pcluster/test_utils.py @@ -1034,7 +1034,7 @@ class TestInstanceTypeInfo: def clear_cache(self): utils.Cache.clear_all() - def test_init_from_instance_type(self, boto3_stubber): + def test_init_from_instance_type(self, boto3_stubber, capsys): mocked_requests = [ MockedBoto3Request( method="describe_instance_types", @@ -1065,14 +1065,31 @@ def test_init_from_instance_type(self, boto3_stubber): }, expected_params={"InstanceTypes": ["g4dn.metal"]}, ), + MockedBoto3Request( + method="describe_instance_types", + response={ + "InstanceTypes": [ + { + "InstanceType": "g4ad.16xlarge", + "VCpuInfo": {"DefaultVCpus": 64}, + "GpuInfo": {"Gpus": [{"Name": "*", "Manufacturer": "AMD", "Count": 4}]}, + "NetworkInfo": {"EfaSupported": False, "MaximumNetworkCards": 1}, + "ProcessorInfo": {"SupportedArchitectures": ["x86_64"]}, + } + ] + }, + expected_params={"InstanceTypes": ["g4ad.16xlarge"]}, + ), ] boto3_stubber("ec2", mocked_requests) for _ in range(0, 2): c4_instance_info = utils.InstanceTypeInfo.init_from_instance_type("c4.xlarge") g4dn_instance_info = utils.InstanceTypeInfo.init_from_instance_type("g4dn.metal") + g4ad_instance_info = utils.InstanceTypeInfo.init_from_instance_type("g4ad.16xlarge") assert_that(c4_instance_info.gpu_count()).is_equal_to(0) + assert_that(capsys.readouterr().out).is_empty() assert_that(c4_instance_info.max_network_interface_count()).is_equal_to(1) assert_that(c4_instance_info.default_threads_per_core()).is_equal_to(2) assert_that(c4_instance_info.vcpus_count()).is_equal_to(4) @@ -1080,12 +1097,21 @@ def test_init_from_instance_type(self, boto3_stubber): assert_that(c4_instance_info.is_efa_supported()).is_equal_to(False) assert_that(g4dn_instance_info.gpu_count()).is_equal_to(8) + assert_that(capsys.readouterr().out).is_empty() assert_that(g4dn_instance_info.max_network_interface_count()).is_equal_to(4) assert_that(g4dn_instance_info.default_threads_per_core()).is_equal_to(2) assert_that(g4dn_instance_info.vcpus_count()).is_equal_to(96) assert_that(g4dn_instance_info.supported_architecture()).is_equal_to(["x86_64"]) 
assert_that(g4dn_instance_info.is_efa_supported()).is_equal_to(True) + assert_that(g4ad_instance_info.gpu_count()).is_equal_to(0) + assert_that(capsys.readouterr().out).matches("not offer native support for 'AMD' GPUs.") + assert_that(g4ad_instance_info.max_network_interface_count()).is_equal_to(1) + assert_that(g4ad_instance_info.default_threads_per_core()).is_equal_to(2) + assert_that(g4ad_instance_info.vcpus_count()).is_equal_to(64) + assert_that(g4ad_instance_info.supported_architecture()).is_equal_to(["x86_64"]) + assert_that(g4ad_instance_info.is_efa_supported()).is_equal_to(False) + def test_init_from_instance_type_failure(self, boto3_stubber): boto3_stubber( "ec2", From 68e43479ba119380ed6384552822586864368a26 Mon Sep 17 00:00:00 2001 From: Rex Date: Sun, 20 Dec 2020 16:11:11 -0700 Subject: [PATCH 57/66] Run c6gn tests with Ubuntu18 Signed-off-by: Rex --- tests/integration-tests/configs/common/common.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/configs/common/common.yaml b/tests/integration-tests/configs/common/common.yaml index 9273480ab4..46d251235a 100644 --- a/tests/integration-tests/configs/common/common.yaml +++ b/tests/integration-tests/configs/common/common.yaml @@ -209,7 +209,7 @@ efa: schedulers: ["slurm"] - regions: ["us-west-2"] instances: ["c6gn.16xlarge"] - oss: ["alinux2", "ubuntu1604"] + oss: ["alinux2", "ubuntu1804"] schedulers: ["slurm"] test_efa.py::test_sit_efa: dimensions: From 627d835ec320841397b625afa9451030501de223 Mon Sep 17 00:00:00 2001 From: ddeidda Date: Mon, 21 Dec 2020 10:09:55 +0100 Subject: [PATCH 58/66] Set use_public_ips=true in EFA tests When P4d instances are used as head node, the parameter use_public_ips must be set to true in order for the public IP to be assigned to the instance. 
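As a quick sanity check (illustration only, not part of this change; the helper name and instance ID are hypothetical, and boto3 credentials are assumed), the head node should now report a public IP:

```python
import boto3

def head_node_public_ip(instance_id, region):
    """Return the instance's public IP, or None if no public IP was assigned."""
    ec2 = boto3.client("ec2", region_name=region)
    reservation = ec2.describe_instances(InstanceIds=[instance_id])["Reservations"][0]
    return reservation["Instances"][0].get("PublicIpAddress")

# With use_public_ips = true this should print an IP address rather than None.
print(head_node_public_ip("i-0123456789abcdef0", "us-east-1"))
```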
Signed-off-by: ddeidda --- .../tests/efa/test_efa/test_hit_efa/pcluster.config.ini | 2 +- .../tests/efa/test_efa/test_sit_efa/pcluster.config.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/efa/test_efa/test_hit_efa/pcluster.config.ini b/tests/integration-tests/tests/efa/test_efa/test_hit_efa/pcluster.config.ini index bdba3da944..fe9a7803e4 100644 --- a/tests/integration-tests/tests/efa/test_efa/test_hit_efa/pcluster.config.ini +++ b/tests/integration-tests/tests/efa/test_efa/test_hit_efa/pcluster.config.ini @@ -39,4 +39,4 @@ max_count = {{ max_queue_size }} vpc_id = {{ vpc_id }} master_subnet_id = {{ public_subnet_id }} compute_subnet_id = {{ private_subnet_id }} -use_public_ips = false +use_public_ips = true diff --git a/tests/integration-tests/tests/efa/test_efa/test_sit_efa/pcluster.config.ini b/tests/integration-tests/tests/efa/test_efa/test_sit_efa/pcluster.config.ini index 7123ce919a..4ef406cf1d 100644 --- a/tests/integration-tests/tests/efa/test_efa/test_sit_efa/pcluster.config.ini +++ b/tests/integration-tests/tests/efa/test_efa/test_sit_efa/pcluster.config.ini @@ -24,4 +24,4 @@ placement_group = DYNAMIC vpc_id = {{ vpc_id }} master_subnet_id = {{ public_subnet_id }} compute_subnet_id = {{ private_subnet_id }} -use_public_ips = false +use_public_ips = true From bb5347a73b1830e0e112b76a2e521ae12891611e Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 21 Dec 2020 11:44:18 +0100 Subject: [PATCH 59/66] Migrate Travis workflow to GitHub actions Signed-off-by: Francesco De Martino --- .github/workflows/ci.yml | 74 ++++++++++++++++++++++++++++++++++++++++ .travis.yml | 45 ------------------------ cli/tox.ini | 2 +- 3 files changed, 75 insertions(+), 46 deletions(-) create mode 100644 .github/workflows/ci.yml delete mode 100644 .travis.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000..a40d6d9c1a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,74 @@ +name: ParallelCluster CI + +on: [push] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + name: + - Python 2.7 Tests + - Python 3.4 Tests + - Python 3.5 Tests + - Python 3.6 Tests + - Python 3.7 Tests + - Python 3.8 Tests + - Python 3.9 Tests + - Python 3.8 Tests Coverage + - Code Checks + - CloudFormation Templates Checks + include: + - name: Python 2.7 Tests + python: 2.7 + toxdir: cli + toxenv: py27-nocov + - name: Python 3.4 Tests + python: 3.4 + toxdir: cli + toxenv: py34-nocov + - name: Python 3.5 Tests + python: 3.5 + toxdir: cli + toxenv: py35-nocov + - name: Python 3.6 Tests + python: 3.6 + toxdir: cli + toxenv: py36-nocov + - name: Python 3.7 Tests + python: 3.7 + toxdir: cli + toxenv: py37-nocov + - name: Python 3.8 Tests + python: 3.8 + toxdir: cli + toxenv: py38-nocov + - name: Python 3.9 Tests + python: 3.9 + toxdir: cli + toxenv: py39-nocov + - name: Python 3.8 Tests Coverage + python: 3.8 + toxdir: cli + toxenv: py38-cov + - name: Code Checks + python: 3.6 + toxdir: cli + toxenv: code-linters + - name: CloudFormation Templates Checks + python: 3.6 + toxdir: tests/integration-tests + toxenv: cfn-format-check,cfn-lint,cfn-tests + + steps: + - uses: actions/checkout@v2 + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python }} + - name: Install Tox and any other packages + run: pip install tox + - name: Run Tox + run: cd ${{ matrix.toxdir }} && tox -e ${{ matrix.toxenv }} diff --git a/.travis.yml 
b/.travis.yml deleted file mode 100644 index b9130aab4a..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,45 +0,0 @@ -language: python -sudo: required -dist: xenial - -matrix: - include: - - name: Python 2.7 Tests - python: 2.7 - env: TOXENV=py27-nocov - - name: Python 3.4 Tests - python: 3.4 - env: TOXENV=py34-nocov - - name: Python 3.5 Tests - python: 3.5 - env: TOXENV=py35-nocov - - name: Python 3.6 Tests - python: 3.6 - env: TOXENV=py36-nocov - - name: Python 3.7 Tests - python: 3.7 - env: TOXENV=py37-nocov - - name: Python 3.8 Tests - python: 3.8 - env: TOXENV=py38-nocov - - name: Python 3.9 Tests - python: 3.9 - env: TOXENV=py39-nocov - - name: Python 3.8 Tests Coverage - python: 3.8 - env: TOXENV=py38-cov - - name: Code Checks - python: 3.6 - env: TOXENV=code-linters - - name: CloudFormation Templates Checks - python: 3.6 - env: TOXENV=cfn-format-check,cfn-lint,cfn-tests - - name: Validate integration tests configs - python: 3.6 - env: TOXENV=validate-test-configs - script: cd tests/integration-tests && tox - -install: - - pip install tox-travis - -script: cd cli && tox diff --git a/cli/tox.ini b/cli/tox.ini index 061f9f140d..1667cb02cd 100644 --- a/cli/tox.ini +++ b/cli/tox.ini @@ -8,7 +8,7 @@ envlist = # Default testenv. Used to run tests on all python versions. [testenv] passenv = - CI TRAVIS_BUILD_ID TRAVIS TRAVIS_BRANCH TRAVIS_JOB_NUMBER TRAVIS_PULL_REQUEST TRAVIS_JOB_ID TRAVIS_REPO_SLUG TRAVIS_COMMIT + CI GITHUB_* usedevelop = cov: true nocov: false From 347fabbddb10b28a80ba7b02324cf4d187cf7914 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 21 Dec 2020 14:18:16 +0100 Subject: [PATCH 60/66] Enable CI GitHub action to run on pull_request Signed-off-by: Francesco De Martino --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a40d6d9c1a..87dbe03dcc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,6 @@ name: ParallelCluster CI -on: [push] +on: [push, pull_request] jobs: build: From b3918cbbc567afa41c3d381898ec050dc2b47dd7 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 21 Dec 2020 13:02:09 +0100 Subject: [PATCH 61/66] Update changelog for 2.10.1 Signed-off-by: Francesco De Martino --- CHANGELOG.md | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index feeec67117..8ae5c8927d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,35 +10,49 @@ CHANGELOG - At the time of this version launch: - AWS Lustre and ARM instance type are not supported in me-south-1, af-south-1 and eu-south-1 - AWS Batch is not supported in af-south-1 - - EBS io2 is not supported in af-south-1 and eu-south-1 -- Remove CloudFormation DescribeStacks API call from AWS Batch Docker entrypoint. This removes the possibility of job - failures due to CloudFormation throttling. -- Install EFA kernel module also on ARM instances with `alinux2` and `ubuntu1804` + - EBS io2 is not supported in af-south-1 and eu-south-1 +- Install Arm Performance Libraries (APL) 20.2.1 on ARM AMIs (CentOS8, Alinux2, Ubuntu1804). +- Install EFA kernel module on ARM instances with `alinux2` and `ubuntu1804`. This enables support for `c6gn` instances. +- Add support for io2 and gp3 EBS volume type. - Add `iam_lambda_role` parameter under `cluster` section to enable the possibility to specify an existing IAM role to be used by AWS Lambda functions in CloudFormation. 
When using `sge`, `torque`, or `slurm` as the scheduler, `pcluster` will not create any IAM role if both `ec2_iam_role` and `iam_lambda_role` are provided. -- Add support for io2 and gp3 EBS volume type. +- Improve robustness of a Slurm cluster when clustermgtd is down. +- Configure NFS threads to be max(8, num_cores) for performance. This enhancement will not take effect on Ubuntu 16.04. +- Optimize calls to DescribeInstanceTypes EC2 API when validating cluster configuration. **CHANGES** -- Pull Amazon Linux Docker images from ECR when building docker image for `awsbatch` scheduler. -- Use inclusive language in user facing messages and internal naming convention. -- Change the default of instance types from the hardcoded `t2.micro` to the free tier instance type - (`t2.micro` or `t3.micro` dependent on region). In regions without free tier, the default is `t3.micro`. -- Enable support for p4d as head node instance type. (p4d was already supported as compute node in 2.10.0) -- Upgrade EFA installer to version 1.11.0 +- Upgrade EFA installer to version 1.11.0. - EFA configuration: ``efa-config-1.6`` (from efa-config-1.5) - EFA profile: ``efa-profile-1.2`` (from efa-profile-1.1) - EFA kernel module: ``efa-1.10.2`` (no change) - RDMA core: ``rdma-core-31.2amzn`` (from rdma-core-31.amzn0) - Libfabric: ``libfabric-1.11.1amzn1.0`` (from libfabric-1.11.1amzn1.1) - Open MPI: ``openmpi40-aws-4.0.5`` (no change) +- Upgrade Intel MPI to version U8. +- Upgrade NICE DCV to version 2020.2-9662. +- Set default systemd runlevel to multi-user.target on all OSes during ParallelCluster official AMI creation. + The runlevel is set to graphical.target on head node only when DCV is enabled. This prevents the execution of + graphical services, such as x/gdm, when they are not required. +- Download Intel MPI and HPC packages from S3 rather than Intel yum repos. +- Change the default of instance types from the hardcoded `t2.micro` to the free tier instance type + (`t2.micro` or `t3.micro` dependent on region). In regions without free tier, the default is `t3.micro`. +- Enable support for p4d as head node instance type (p4d was already supported as compute node in 2.10.0). +- Pull Amazon Linux Docker images from public ECR when building docker image for `awsbatch` scheduler. +- Increase max retry attempts when registering Slurm nodes in Route53. **BUG FIXES** +- Fix pcluster createami for Ubuntu 1804 by downloading SGE sources from Debian repository and not from the EOL + Ubuntu 19.10. +- Remove CloudFormation DescribeStacks API call from AWS Batch Docker entrypoint. This removes the risk of job + failures due to CloudFormation throttling. - Mandate the presence of `vpc_settings`, `vpc_id`, `master_subnet_id` in the config file to avoid unhandled exceptions. - Set the default EBS volume size to 500 GiB when volume type is `st1` or `sc1`. +- Fix installation of Intel PSXE package on CentOS 7 by using yum4. +- Fix routing issues with multiple Network Interfaces on Ubuntu 18.04. 
2.10.0 ------ From 337a83c24f4ed954e22cab14252ceb55f044f5c2 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 21 Dec 2020 16:14:03 +0100 Subject: [PATCH 62/66] Update changelog for 2.10.1 Signed-off-by: Francesco De Martino --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ae5c8927d..385271fc4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ CHANGELOG - Add support for me-south-1 region (Bahrein), af-south-1 region (Cape Town) and eu-south-1 region (Milan) - At the time of this version launch: - - AWS Lustre and ARM instance type are not supported in me-south-1, af-south-1 and eu-south-1 + - Amazon FSx for Lustre and ARM instance types are not supported in me-south-1, af-south-1 and eu-south-1 - AWS Batch is not supported in af-south-1 - EBS io2 is not supported in af-south-1 and eu-south-1 - Install Arm Performance Libraries (APL) 20.2.1 on ARM AMIs (CentOS8, Alinux2, Ubuntu1804). From c9e279efc2229421cc98295734bd0a2d108ba7de Mon Sep 17 00:00:00 2001 From: Tim Lane Date: Mon, 21 Dec 2020 13:05:22 -0800 Subject: [PATCH 63/66] Run CFN linter github action in CLI dir Signed-off-by: Tim Lane --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 87dbe03dcc..e0464dd62c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,7 +59,7 @@ jobs: toxenv: code-linters - name: CloudFormation Templates Checks python: 3.6 - toxdir: tests/integration-tests + toxdir: cli toxenv: cfn-format-check,cfn-lint,cfn-tests steps: From cd7c2fdc1f0a61f099b21e4f89f1abb45ded5bac Mon Sep 17 00:00:00 2001 From: chenwany Date: Mon, 21 Dec 2020 08:51:04 -0800 Subject: [PATCH 64/66] Change validator to unblock io2 Block Express Volume Modify the iops and size range ro unblock user create io2 Block Express volume Signed-off-by: chenwany --- cli/src/pcluster/config/validators.py | 6 +++--- cli/tests/pcluster/config/test_validators.py | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py index e1b46ea143..be7a235171 100644 --- a/cli/src/pcluster/config/validators.py +++ b/cli/src/pcluster/config/validators.py @@ -76,7 +76,7 @@ EBS_VOLUME_TYPE_TO_VOLUME_SIZE_BOUNDS = { "standard": (1, 1024), "io1": (4, 16 * 1024), - "io2": (4, 16 * 1024), + "io2": (4, 64 * 1024), "gp2": (1, 16 * 1024), "gp3": (1, 16 * 1024), "st1": (500, 16 * 1024), @@ -85,7 +85,7 @@ EBS_VOLUME_IOPS_BOUNDS = { "io1": (100, 64000), - "io2": (100, 64000), + "io2": (100, 256000), "gp3": (3000, 16000), } @@ -1360,7 +1360,7 @@ def ebs_volume_iops_validator(section_key, section_label, pcluster_config): section = pcluster_config.get_section(section_key, section_label) volume_size = section.get_param_value("volume_size") volume_type = section.get_param_value("volume_type") - volume_type_to_iops_ratio = {"io1": 50, "io2": 500, "gp3": 500} + volume_type_to_iops_ratio = {"io1": 50, "io2": 1000, "gp3": 500} volume_iops = section.get_param_value("volume_iops") if volume_type in EBS_VOLUME_IOPS_BOUNDS: diff --git a/cli/tests/pcluster/config/test_validators.py b/cli/tests/pcluster/config/test_validators.py index e87b5b8ef3..1de759b4ef 100644 --- a/cli/tests/pcluster/config/test_validators.py +++ b/cli/tests/pcluster/config/test_validators.py @@ -698,14 +698,14 @@ def test_efs_validator(mocker, section_dict, expected_message): ({"volume_type": "io2", 
"volume_size": 20, "volume_iops": 120}, None), ( {"volume_type": "io2", "volume_size": 20, "volume_iops": 90}, - "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", + "IOPS rate must be between 100 and 256000 when provisioning io2 volumes.", ), ( - {"volume_type": "io2", "volume_size": 20, "volume_iops": 64001}, - "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", + {"volume_type": "io2", "volume_size": 20, "volume_iops": 256001}, + "IOPS rate must be between 100 and 256000 when provisioning io2 volumes.", ), ( - {"volume_type": "io2", "volume_size": 20, "volume_iops": 10001}, + {"volume_type": "io2", "volume_size": 20, "volume_iops": 20001}, "IOPS to volume size ratio of .* is too high", ), ({"volume_type": "gp3", "volume_size": 20, "volume_iops": 3000}, None), @@ -2449,7 +2449,7 @@ def test_fsx_ignored_parameters_validator(mocker, section_dict, expected_error): ({"volume_type": "io1", "volume_size": 16385}, "The size of io1 volumes can not exceed 16384 GiB"), ({"volume_type": "io2", "volume_size": 15}, None), ({"volume_type": "io2", "volume_size": 3}, "The size of io2 volumes must be at least 4 GiB"), - ({"volume_type": "io2", "volume_size": 16385}, "The size of io2 volumes can not exceed 16384 GiB"), + ({"volume_type": "io2", "volume_size": 65537}, "The size of io2 volumes can not exceed 65536 GiB"), ({"volume_type": "gp2", "volume_size": 15}, None), ({"volume_type": "gp2", "volume_size": 0}, "The size of gp2 volumes must be at least 1 GiB"), ({"volume_type": "gp2", "volume_size": 16385}, "The size of gp2 volumes can not exceed 16384 GiB"), @@ -2493,14 +2493,14 @@ def test_ebs_allowed_values_all_have_volume_size_bounds(): ({"volume_type": "io2", "volume_size": 20, "volume_iops": 120}, None), ( {"volume_type": "io2", "volume_size": 20, "volume_iops": 90}, - "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", + "IOPS rate must be between 100 and 256000 when provisioning io2 volumes.", ), ( - {"volume_type": "io2", "volume_size": 20, "volume_iops": 64001}, - "IOPS rate must be between 100 and 64000 when provisioning io2 volumes.", + {"volume_type": "io2", "volume_size": 20, "volume_iops": 256001}, + "IOPS rate must be between 100 and 256000 when provisioning io2 volumes.", ), ( - {"volume_type": "io2", "volume_size": 20, "volume_iops": 10001}, + {"volume_type": "io2", "volume_size": 20, "volume_iops": 20001}, "IOPS to volume size ratio of .* is too high", ), ({"volume_type": "gp3", "volume_size": 20, "volume_iops": 3000}, None), From 9be10de123c0e6535ca4b53b861f7c8f81e43b79 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 22 Dec 2020 13:08:53 +0100 Subject: [PATCH 65/66] Upgrade EFA installer to version 1.11.1 Changelog ``` - EFA configuration: ``efa-config-1.7`` (from efa-config-1.5) - EFA profile: ``efa-profile-1.3`` (from efa-profile-1.1) - EFA kernel module: ``efa-1.10.2`` (no change) - RDMA core: ``rdma-core-31.2amzn`` (from rdma-core-31.amzn0) - Libfabric: ``libfabric-1.11.1amzn1.0`` (from libfabric-1.11.1amzn1.1) - Open MPI: ``openmpi40-aws-4.1.0`` (from openmpi40-aws-4.0.5) ``` Signed-off-by: Luca Carrogu --- CHANGELOG.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 385271fc4c..2ce782e751 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,13 +24,13 @@ CHANGELOG **CHANGES** -- Upgrade EFA installer to version 1.11.0. 
- - EFA configuration: ``efa-config-1.6`` (from efa-config-1.5) - - EFA profile: ``efa-profile-1.2`` (from efa-profile-1.1) +- Upgrade EFA installer to version 1.11.1. + - EFA configuration: ``efa-config-1.7`` (from efa-config-1.5) + - EFA profile: ``efa-profile-1.3`` (from efa-profile-1.1) - EFA kernel module: ``efa-1.10.2`` (no change) - RDMA core: ``rdma-core-31.2amzn`` (from rdma-core-31.amzn0) - Libfabric: ``libfabric-1.11.1amzn1.0`` (from libfabric-1.11.1amzn1.1) - - Open MPI: ``openmpi40-aws-4.0.5`` (no change) + - Open MPI: ``openmpi40-aws-4.1.0`` (from openmpi40-aws-4.0.5) - Upgrade Intel MPI to version U8. - Upgrade NICE DCV to version 2020.2-9662. - Set default systemd runlevel to multi-user.target on all OSes during ParallelCluster official AMI creation. From ab8383cb4f664dd9d467ac70370b578a644d062f Mon Sep 17 00:00:00 2001 From: ParallelCluster AMI bot Date: Tue, 22 Dec 2020 19:05:45 +0000 Subject: [PATCH 66/66] Update AMI List Build Number 597 aws-parallelcluster-cookbook Git hash: d5378bb60f7810bb2f467e5ada9589cc8607ee2e aws-parallelcluster-node Git hash: ae7c4b123d18399361b85e31473ad9ee53b21e45 Signed-off-by: ParallelCluster AMI bot --- amis.txt | 374 ++++++++++--------- cloudformation/aws-parallelcluster.cfn.json | 386 +++++++++++--------- 2 files changed, 422 insertions(+), 338 deletions(-) diff --git a/amis.txt b/amis.txt index fb20f63162..1b557023fe 100644 --- a/amis.txt +++ b/amis.txt @@ -1,144 +1,163 @@ ## x86_64 # alinux -ap-east-1: ami-086d023ae9be29265 -ap-northeast-1: ami-071d1f9ecf81d5866 -ap-northeast-2: ami-0f13dd00403ae009c +af-south-1: ami-0f2e2135a05f814df +ap-east-1: ami-05ef7cb79d3a43092 +ap-northeast-1: ami-006b7f4c929aaf4a9 +ap-northeast-2: ami-06a49824cc501a981 ap-northeast-3: UNSUPPORTED -ap-south-1: ami-09952eb2d4dd3f2e2 -ap-southeast-1: ami-043d99842781aff9d -ap-southeast-2: ami-02656322bc3cae72c -ca-central-1: ami-0a853d47df00d434a -cn-north-1: ami-03a764b8e78792057 -cn-northwest-1: ami-077d4bb98d5ca5def -eu-central-1: ami-09f3ed2c18ba86996 -eu-north-1: ami-03d6705f566e99836 -eu-west-1: ami-02c2421d6cd745994 -eu-west-2: ami-0ef2ff4ce73c7208e -eu-west-3: ami-07e87c1e0dd3d3e75 -sa-east-1: ami-0d552ac237f838360 -us-east-1: ami-01a1cf6f36f2bd13b -us-east-2: ami-0a8d40acce6869be4 -us-gov-east-1: ami-05bb72e83f0973bdc -us-gov-west-1: ami-0318093bb66476048 -us-west-1: ami-0425c898b65b066f3 -us-west-2: ami-0ab937dbac92ae27e +ap-south-1: ami-0a47fd68cf7034c58 +ap-southeast-1: ami-0e6cfdde386164836 +ap-southeast-2: ami-0ba7f788162c4a4de +ca-central-1: ami-0808f2df200c7006c +cn-north-1: ami-050c2b7b0181fbfd5 +cn-northwest-1: ami-07d8dd2f175498353 +eu-central-1: ami-032358687770c43e1 +eu-north-1: ami-038781ae9b21b98ca +eu-south-1: ami-0a9a6c50dc32a934f +eu-west-1: ami-0dd799c2e1a68608e +eu-west-2: ami-0849a887182dd033a +eu-west-3: ami-060df914a5b5ad680 +me-south-1: ami-0d68c8e916ccf0418 +sa-east-1: ami-0d4a17532432d5aa2 +us-east-1: ami-0604e4a14869de93f +us-east-2: ami-00d4efc81188687a0 +us-gov-east-1: ami-01d57910cd71ea0c4 +us-gov-west-1: ami-0cfc4f4eb94c9f403 +us-west-1: ami-0c8decb747bfca25f +us-west-2: ami-018ccd7660ecade5e # alinux2 -ap-east-1: ami-060927fff43c77a88 -ap-northeast-1: ami-068b3b3104ae04d62 -ap-northeast-2: ami-07cd9137f04b28895 +af-south-1: ami-046f49b550ce90d8a +ap-east-1: ami-0ec0d099b8a276aec +ap-northeast-1: ami-0a13402dc88c19be2 +ap-northeast-2: ami-0bdfbd3521caa5dd2 ap-northeast-3: UNSUPPORTED -ap-south-1: ami-06cc0e0c03bd6abd9 -ap-southeast-1: ami-005fd4e1d2ccdd8a9 -ap-southeast-2: ami-0256ef02207960118 -ca-central-1: 
ami-08f2692bcab1f1660 -cn-north-1: ami-0b03a127af1f7956c -cn-northwest-1: ami-0beb9d63e7fe38381 -eu-central-1: ami-04d384114ab202c13 -eu-north-1: ami-01afd40fed001ed87 -eu-west-1: ami-04627ea002a11c93c -eu-west-2: ami-0953845dcd74d8d34 -eu-west-3: ami-01525b6212f1a191b -sa-east-1: ami-0cabfea2e0cf31af2 -us-east-1: ami-018cd948cda2d7384 -us-east-2: ami-08aa991eca6bf394a -us-gov-east-1: ami-0cbbce3284f341759 -us-gov-west-1: ami-01b605af54e3fdd9e -us-west-1: ami-04b508b1dae54310b -us-west-2: ami-0f085dccfc6937af7 +ap-south-1: ami-0059d599d21636768 +ap-southeast-1: ami-074f58cccc7ebb68f +ap-southeast-2: ami-04b4a20ee9f67608f +ca-central-1: ami-0523a9bc4151ee96e +cn-north-1: ami-0a12307d2d0ddc535 +cn-northwest-1: ami-0ef379c7fd5eb332e +eu-central-1: ami-07055c21834b0bc56 +eu-north-1: ami-0fe475ca307943eb1 +eu-south-1: ami-042fc69d71433a75c +eu-west-1: ami-063ac3df7f8595751 +eu-west-2: ami-086111e12527fa455 +eu-west-3: ami-0258a5a8320ccfa42 +me-south-1: ami-0c98692d98eb38c50 +sa-east-1: ami-0f463bcf6d86cad85 +us-east-1: ami-0b71488efbe422723 +us-east-2: ami-0075df3faa5b6e07e +us-gov-east-1: ami-057f7c2d5a1ca7b7d +us-gov-west-1: ami-01222b796bafd609f +us-west-1: ami-01c4b0b6d5597b80b +us-west-2: ami-079facc5ab3fdf701 # centos7 -ap-east-1: ami-0f7168e4940a70237 -ap-northeast-1: ami-07555b739b7d7f81c -ap-northeast-2: ami-0c0df7a4adaebab82 +af-south-1: ami-0e0fb5acd64f2be5e +ap-east-1: ami-0d6f16d7fceae84ee +ap-northeast-1: ami-03a451be7ebcc159e +ap-northeast-2: ami-0ba51cb6c4ceae756 ap-northeast-3: UNSUPPORTED -ap-south-1: ami-0d4e69b97911c88aa -ap-southeast-1: ami-0e715873ed4fdf15a -ap-southeast-2: ami-04ba6428ce77e728a -ca-central-1: ami-08f798212e2b9dcc9 +ap-south-1: ami-0919124b7770af8d9 +ap-southeast-1: ami-021e3e90b0458781f +ap-southeast-2: ami-0f5a161afeed62a35 +ca-central-1: ami-057fc92460a096dab cn-north-1: UNSUPPORTED cn-northwest-1: UNSUPPORTED -eu-central-1: ami-0b7dd5253f4218850 -eu-north-1: ami-01d49e3863d1f2d11 -eu-west-1: ami-009bfc0fb11ad5c6c -eu-west-2: ami-0100afbc14f4008ce -eu-west-3: ami-0b2a108cb09e900ac -sa-east-1: ami-0dde4970d46cc7abc -us-east-1: ami-0bbd714da0c7763e2 -us-east-2: ami-0b3d403d025f9ba61 +eu-central-1: ami-0a0f1b95d41e6a651 +eu-north-1: ami-0c191b20554866575 +eu-south-1: ami-03276d70dacf1a574 +eu-west-1: ami-000a3d84d3c77fdb3 +eu-west-2: ami-0a347ff9b26c5e34c +eu-west-3: ami-02f8c6e2622c3804f +me-south-1: ami-0a64f83dd01c08dab +sa-east-1: ami-082aafa914dc04479 +us-east-1: ami-0516dc2ba9f4fc177 +us-east-2: ami-07d1461ceceb4df43 us-gov-east-1: UNSUPPORTED us-gov-west-1: UNSUPPORTED -us-west-1: ami-0e589824d8d821ce0 -us-west-2: ami-0f386f4a170027bbb +us-west-1: ami-0a426e145ced105df +us-west-2: ami-0e92bdb4aee551791 # centos8 -ap-east-1: ami-081b80aa11a8a4c50 -ap-northeast-1: ami-09151d65b9344095c -ap-northeast-2: ami-09e91a17a5454daf8 +af-south-1: ami-0d881910a58319c15 +ap-east-1: ami-0ca1228e9ddcfa963 +ap-northeast-1: ami-075b3892ecd63214f +ap-northeast-2: ami-02287a8528bba818d ap-northeast-3: UNSUPPORTED -ap-south-1: ami-055bc2174e27b050e -ap-southeast-1: ami-0f221ad56e980433c -ap-southeast-2: ami-085265c078d86c75b -ca-central-1: ami-0fad54150dd6e4d1a +ap-south-1: ami-0602d9f62b83744f5 +ap-southeast-1: ami-0e789f414c14f6332 +ap-southeast-2: ami-0d1d7229f7b73a5de +ca-central-1: ami-03ea006f20d390940 cn-north-1: UNSUPPORTED cn-northwest-1: UNSUPPORTED -eu-central-1: ami-0d986e39b68bd9add -eu-north-1: ami-014f4cd90479d0c71 -eu-west-1: ami-09a3a5c16f6c84015 -eu-west-2: ami-09956ce8a4fda6388 -eu-west-3: ami-08454ae2840055567 -sa-east-1: ami-0f2a6ece6191f85e8 
-us-east-1: ami-0615c0838767b8e00 -us-east-2: ami-0a07f43c650ad20e9 +eu-central-1: ami-02277a3e208351cc6 +eu-north-1: ami-00b01d23b71fff297 +eu-south-1: ami-0ffbb41ac00ef5dff +eu-west-1: ami-04a5c9c04faa5b9f7 +eu-west-2: ami-017fab72fc1db3851 +eu-west-3: ami-041e8760a7b80cfde +me-south-1: ami-0e30fd8da5a1ddc84 +sa-east-1: ami-005096f26ee208519 +us-east-1: ami-0f56d9873066cb6b9 +us-east-2: ami-073e63c94c971cc69 us-gov-east-1: UNSUPPORTED us-gov-west-1: UNSUPPORTED -us-west-1: ami-0e71f40a3e18cb184 -us-west-2: ami-0668454eee630c595 +us-west-1: ami-0f3085000b53339e0 +us-west-2: ami-029dc099ae4e121f1 # ubuntu1604 -ap-east-1: ami-0e4408eaab978d69d -ap-northeast-1: ami-021df455e65a94ba1 -ap-northeast-2: ami-02a8aa869263d3268 +af-south-1: ami-041e26e4bfed8cd7b +ap-east-1: ami-04539233794f181f7 +ap-northeast-1: ami-01fe6e75948fa65df +ap-northeast-2: ami-05dd0b07a0e8cf644 ap-northeast-3: UNSUPPORTED -ap-south-1: ami-0b939ee54f0c676de -ap-southeast-1: ami-04b44a45309e8f7b7 -ap-southeast-2: ami-06367ff848861ddfc -ca-central-1: ami-0438f2fca91841651 -cn-north-1: ami-0e4c187d835abb08b -cn-northwest-1: ami-0be4b9f412690daa2 -eu-central-1: ami-0f81af77a1e347a75 -eu-north-1: ami-0875c09d8a230cc3f -eu-west-1: ami-0da1dd9a8b40ee87a -eu-west-2: ami-0519e22cbfd281d2c -eu-west-3: ami-0bf33211929a48c9f -sa-east-1: ami-0c337a8051bd0ed6f -us-east-1: ami-04ce9ff46c759ffa8 -us-east-2: ami-0eb0a40959685a105 -us-gov-east-1: ami-09bdb46f2643fefbd -us-gov-west-1: ami-0b790f0b4c3856aa3 -us-west-1: ami-012699e63307886c5 -us-west-2: ami-08b09c217d19d5e1a +ap-south-1: ami-06aec0a1241e29730 +ap-southeast-1: ami-0f35154dc0071f71b +ap-southeast-2: ami-0da4c39e17cdfef89 +ca-central-1: ami-098c762477bc6b1fb +cn-north-1: ami-0a3f41e4d89bdff32 +cn-northwest-1: ami-0d304a0c5d04ac4e2 +eu-central-1: ami-0df794834b461ba22 +eu-north-1: ami-06975933696e0263a +eu-south-1: ami-0842531296e56778e +eu-west-1: ami-0d9c6bf221068c7c3 +eu-west-2: ami-0d2a3fa50134294e9 +eu-west-3: ami-010b0bc4570ec96a3 +me-south-1: ami-08d2926b1669d79d2 +sa-east-1: ami-0ae46391f64fa0b71 +us-east-1: ami-0b3dfe986b324a1bf +us-east-2: ami-04fd4dda7bb2fcaff +us-gov-east-1: ami-00150b953797bdaa4 +us-gov-west-1: ami-06bfdc6f4185351c5 +us-west-1: ami-00fbdde9fb06d3b09 +us-west-2: ami-008383c0ab2a2d425 # ubuntu1804 -ap-east-1: ami-05bd159bdb29e9aa9 -ap-northeast-1: ami-0f99a29d4392ac446 -ap-northeast-2: ami-0556b2de682ca759d +af-south-1: ami-04280c7f4bee35afe +ap-east-1: ami-0d16f6585c134b76d +ap-northeast-1: ami-096205fd8c1ea23b8 +ap-northeast-2: ami-04ef860d893888eee ap-northeast-3: UNSUPPORTED -ap-south-1: ami-059ddcf5c8e408012 -ap-southeast-1: ami-04878f0e013df9869 -ap-southeast-2: ami-02fa7e4f4b1073823 -ca-central-1: ami-0e6aa6758cd219754 -cn-north-1: ami-099431ae52fbcc1b9 -cn-northwest-1: ami-09277e9f7bb212c56 -eu-central-1: ami-0928231bd3b6a52b2 -eu-north-1: ami-0dfc3abcaeeafebd6 -eu-west-1: ami-06065b90e25ef853b -eu-west-2: ami-0a26ef861ba74b872 -eu-west-3: ami-0ee11f5712078f6e3 -sa-east-1: ami-0ff7ab2d586a7f0a6 -us-east-1: ami-05b80d924accf2dac -us-east-2: ami-05e89121e9222b5c6 -us-gov-east-1: ami-0b540f46f29ef9019 -us-gov-west-1: ami-00c69b11f502f4f08 -us-west-1: ami-0339ba2c62b77a99e -us-west-2: ami-036a032a9f6c44f84 +ap-south-1: ami-09fe484636519a7fd +ap-southeast-1: ami-0d2894ee85aac22c0 +ap-southeast-2: ami-07de67a2c91b2605c +ca-central-1: ami-0b8b3c3a561758ae3 +cn-north-1: ami-0abc7e40f18e6cda4 +cn-northwest-1: ami-0f52a155923e4de7f +eu-central-1: ami-07d1489352b517f39 +eu-north-1: ami-0c47a559ed268c649 +eu-south-1: ami-01cfe122f3044a34d +eu-west-1: 
ami-0cbef8b383ddeff80 +eu-west-2: ami-08d337feaf0f0a59f +eu-west-3: ami-074fae2e6420b8ac7 +me-south-1: ami-0d7bc19407d2b26a9 +sa-east-1: ami-0b7b49d35034bbc2f +us-east-1: ami-009fdaa0002906c5b +us-east-2: ami-0ec51c20170525d3f +us-gov-east-1: ami-04a13dedb7a0a1cfa +us-gov-west-1: ami-0be2fc1895e4b4d9f +us-west-1: ami-04ac69ccbff147270 +us-west-2: ami-01a7264e2e3bf272f ## arm64 # alinux +af-south-1: UNSUPPORTED ap-east-1: UNSUPPORTED ap-northeast-1: UNSUPPORTED ap-northeast-2: UNSUPPORTED @@ -151,9 +170,11 @@ cn-north-1: UNSUPPORTED cn-northwest-1: UNSUPPORTED eu-central-1: UNSUPPORTED eu-north-1: UNSUPPORTED +eu-south-1: UNSUPPORTED eu-west-1: UNSUPPORTED eu-west-2: UNSUPPORTED eu-west-3: UNSUPPORTED +me-south-1: UNSUPPORTED sa-east-1: UNSUPPORTED us-east-1: UNSUPPORTED us-east-2: UNSUPPORTED @@ -162,29 +183,33 @@ us-gov-west-1: UNSUPPORTED us-west-1: UNSUPPORTED us-west-2: UNSUPPORTED # alinux2 -ap-east-1: ami-0afa2c302be613354 -ap-northeast-1: ami-0d90445a8e0dd846e -ap-northeast-2: ami-012482cac933631dd +af-south-1: ami-00cd9a9915d5abf79 +ap-east-1: ami-0f73d5986d43564e2 +ap-northeast-1: ami-0802175acef9f342e +ap-northeast-2: ami-0b67c1511fe4c46e1 ap-northeast-3: UNSUPPORTED -ap-south-1: ami-0e66f1d2824238a00 -ap-southeast-1: ami-0944c0f8c608a1dbe -ap-southeast-2: ami-059405472e209bf90 -ca-central-1: ami-075d7c38d2c7f2347 +ap-south-1: ami-0e75152f8053094c3 +ap-southeast-1: ami-0da4635ceb6d846d9 +ap-southeast-2: ami-0d7a81adbdde9dce5 +ca-central-1: ami-070c24fa069a27265 cn-north-1: UNSUPPORTED cn-northwest-1: UNSUPPORTED -eu-central-1: ami-0d99e8ab1b493e9bc -eu-north-1: ami-064946adc4b641961 -eu-west-1: ami-0bb09e10d8b5cb747 -eu-west-2: ami-0f857639eb82d291e -eu-west-3: ami-0b606e6e56b851d66 -sa-east-1: ami-010d19c007ce5a2a4 -us-east-1: ami-0e96b1f22bc4ded89 -us-east-2: ami-065f40fae77a9fb41 +eu-central-1: ami-0331d559f079efd03 +eu-north-1: ami-03e4cc7f565c8efec +eu-south-1: ami-0392d65ba7c8af2b5 +eu-west-1: ami-0b86c4a8da59d6d11 +eu-west-2: ami-039c5ad24328545a9 +eu-west-3: ami-067a371d97ca6fcf2 +me-south-1: ami-0ea4909cdeeef4b03 +sa-east-1: ami-013861b8a2b1a63b5 +us-east-1: ami-0b1f998cf2b1498db +us-east-2: ami-059703a477566540c us-gov-east-1: UNSUPPORTED us-gov-west-1: UNSUPPORTED -us-west-1: ami-0715d8a272300c41f -us-west-2: ami-01c9e5995bc8ee16a +us-west-1: ami-08540a991b0cd29bd +us-west-2: ami-0257f455a26d9ed84 # centos7 +af-south-1: UNSUPPORTED ap-east-1: UNSUPPORTED ap-northeast-1: UNSUPPORTED ap-northeast-2: UNSUPPORTED @@ -197,9 +222,11 @@ cn-north-1: UNSUPPORTED cn-northwest-1: UNSUPPORTED eu-central-1: UNSUPPORTED eu-north-1: UNSUPPORTED +eu-south-1: UNSUPPORTED eu-west-1: UNSUPPORTED eu-west-2: UNSUPPORTED eu-west-3: UNSUPPORTED +me-south-1: UNSUPPORTED sa-east-1: UNSUPPORTED us-east-1: UNSUPPORTED us-east-2: UNSUPPORTED @@ -208,29 +235,33 @@ us-gov-west-1: UNSUPPORTED us-west-1: UNSUPPORTED us-west-2: UNSUPPORTED # centos8 -ap-east-1: ami-090b2a584a3f7f287 -ap-northeast-1: ami-082dd4b62e4501fd3 -ap-northeast-2: ami-0a1d5d4acc59b8c42 +af-south-1: ami-0b0a12580b8fcdfe5 +ap-east-1: ami-007ed99f82a962cc1 +ap-northeast-1: ami-0490e30625fc3466f +ap-northeast-2: ami-03222a04de4228c08 ap-northeast-3: UNSUPPORTED -ap-south-1: ami-07ce7adc39b9e28b2 -ap-southeast-1: ami-04a794d883bdaf470 -ap-southeast-2: ami-0a4a36662eaed020f -ca-central-1: ami-0f6f89bc747dc927c +ap-south-1: ami-0075369b2abf46b05 +ap-southeast-1: ami-086fd32165ac4e0a2 +ap-southeast-2: ami-07fb6c0c050a39292 +ca-central-1: ami-04be4e45fe60b19e1 cn-north-1: UNSUPPORTED cn-northwest-1: UNSUPPORTED -eu-central-1: 
ami-0ff2b3c8975a3db85 -eu-north-1: ami-0047a557a82efb0d0 -eu-west-1: ami-0163897289a397a7b -eu-west-2: ami-041508f55e7353f16 -eu-west-3: ami-08b703b2dce4d24db -sa-east-1: ami-00cb9fa66203e7004 -us-east-1: ami-02597a6058d9bb415 -us-east-2: ami-0b833d4b88b0df37f +eu-central-1: ami-02ea94742b76197a4 +eu-north-1: ami-052bb6243fc4b84a3 +eu-south-1: ami-015ae8f55912771fe +eu-west-1: ami-0773e67d6a7466681 +eu-west-2: ami-009ac53f9323b9865 +eu-west-3: ami-021787c0ad5f60f6b +me-south-1: ami-085892d9aec1ed9f9 +sa-east-1: ami-0d3e6fbc43aaa308c +us-east-1: ami-02839a5871b4ec582 +us-east-2: ami-0d4f0890ef069afc0 us-gov-east-1: UNSUPPORTED us-gov-west-1: UNSUPPORTED -us-west-1: ami-00aa1f310b54e3cd5 -us-west-2: ami-0e6ed2c534bab4d35 +us-west-1: ami-0aaeb3457c5f4a511 +us-west-2: ami-02a5f2d441ae4ea8e # ubuntu1604 +af-south-1: UNSUPPORTED ap-east-1: UNSUPPORTED ap-northeast-1: UNSUPPORTED ap-northeast-2: UNSUPPORTED @@ -243,9 +274,11 @@ cn-north-1: UNSUPPORTED cn-northwest-1: UNSUPPORTED eu-central-1: UNSUPPORTED eu-north-1: UNSUPPORTED +eu-south-1: UNSUPPORTED eu-west-1: UNSUPPORTED eu-west-2: UNSUPPORTED eu-west-3: UNSUPPORTED +me-south-1: UNSUPPORTED sa-east-1: UNSUPPORTED us-east-1: UNSUPPORTED us-east-2: UNSUPPORTED @@ -254,25 +287,28 @@ us-gov-west-1: UNSUPPORTED us-west-1: UNSUPPORTED us-west-2: UNSUPPORTED # ubuntu1804 -ap-east-1: ami-026aa8e70fb166ea9 -ap-northeast-1: ami-0bb8f711d35faf564 -ap-northeast-2: ami-001e5f95b7285db48 +af-south-1: ami-046d98477dd9fc7eb +ap-east-1: ami-0d2624d4b9e6970e7 +ap-northeast-1: ami-08efda47b57a7c048 +ap-northeast-2: ami-05ae60c35b06f34e5 ap-northeast-3: UNSUPPORTED -ap-south-1: ami-060347326df2488ce -ap-southeast-1: ami-0e8da59005bbebc4c -ap-southeast-2: ami-03e63532571847086 -ca-central-1: ami-0e28334f9f98bce1c +ap-south-1: ami-0ef07d3677758fee7 +ap-southeast-1: ami-052b40fbf4a7d852f +ap-southeast-2: ami-02a57f04d151d2ce2 +ca-central-1: ami-0bef62a8de524cc3d cn-north-1: UNSUPPORTED cn-northwest-1: UNSUPPORTED -eu-central-1: ami-0bdcdad522abee324 -eu-north-1: ami-0abb2f4b0fb7f9419 -eu-west-1: ami-07b73abb892002aba -eu-west-2: ami-0c9501e4fae3574e1 -eu-west-3: ami-0fe23436c8cd7c55d -sa-east-1: ami-0e2ac51ba079f6df3 -us-east-1: ami-0bb7443216d8ea706 -us-east-2: ami-02a92e06fd643c11b +eu-central-1: ami-0ab72b2539a8e4eaf +eu-north-1: ami-00315391fc87d6a4a +eu-south-1: ami-00d476744294f458a +eu-west-1: ami-0224e61822e0603ba +eu-west-2: ami-06496496998c0480a +eu-west-3: ami-0d16162157fa31ea4 +me-south-1: ami-0ce1e2728e4ba14c6 +sa-east-1: ami-079a46ad6559d1023 +us-east-1: ami-0a5c0725ce4d960f1 +us-east-2: ami-08776e764b05c8fa7 us-gov-east-1: UNSUPPORTED us-gov-west-1: UNSUPPORTED -us-west-1: ami-084f3531a939fa437 -us-west-2: ami-0a81e33be478706e1 +us-west-1: ami-0ca03ec3eca322ace +us-west-2: ami-0f67bacfcdf1f2374 diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 9856065201..4f9c98eba3 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -814,29 +814,37 @@ }, "Mappings": { "AWSRegionOS2AMIarm64": { + "af-south-1": { + "alinux": "UNSUPPORTED", + "alinux2": "ami-00cd9a9915d5abf79", + "centos7": "UNSUPPORTED", + "centos8": "ami-0b0a12580b8fcdfe5", + "ubuntu1604": "UNSUPPORTED", + "ubuntu1804": "ami-046d98477dd9fc7eb" + }, "ap-east-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0afa2c302be613354", + "alinux2": "ami-0f73d5986d43564e2", "centos7": "UNSUPPORTED", - "centos8": "ami-090b2a584a3f7f287", + "centos8": "ami-007ed99f82a962cc1", "ubuntu1604": 
"UNSUPPORTED", - "ubuntu1804": "ami-026aa8e70fb166ea9" + "ubuntu1804": "ami-0d2624d4b9e6970e7" }, "ap-northeast-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0d90445a8e0dd846e", + "alinux2": "ami-0802175acef9f342e", "centos7": "UNSUPPORTED", - "centos8": "ami-082dd4b62e4501fd3", + "centos8": "ami-0490e30625fc3466f", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0bb8f711d35faf564" + "ubuntu1804": "ami-08efda47b57a7c048" }, "ap-northeast-2": { "alinux": "UNSUPPORTED", - "alinux2": "ami-012482cac933631dd", + "alinux2": "ami-0b67c1511fe4c46e1", "centos7": "UNSUPPORTED", - "centos8": "ami-0a1d5d4acc59b8c42", + "centos8": "ami-03222a04de4228c08", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-001e5f95b7285db48" + "ubuntu1804": "ami-05ae60c35b06f34e5" }, "ap-northeast-3": { "alinux": "UNSUPPORTED", @@ -848,35 +856,35 @@ }, "ap-south-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0e66f1d2824238a00", + "alinux2": "ami-0e75152f8053094c3", "centos7": "UNSUPPORTED", - "centos8": "ami-07ce7adc39b9e28b2", + "centos8": "ami-0075369b2abf46b05", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-060347326df2488ce" + "ubuntu1804": "ami-0ef07d3677758fee7" }, "ap-southeast-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0944c0f8c608a1dbe", + "alinux2": "ami-0da4635ceb6d846d9", "centos7": "UNSUPPORTED", - "centos8": "ami-04a794d883bdaf470", + "centos8": "ami-086fd32165ac4e0a2", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0e8da59005bbebc4c" + "ubuntu1804": "ami-052b40fbf4a7d852f" }, "ap-southeast-2": { "alinux": "UNSUPPORTED", - "alinux2": "ami-059405472e209bf90", + "alinux2": "ami-0d7a81adbdde9dce5", "centos7": "UNSUPPORTED", - "centos8": "ami-0a4a36662eaed020f", + "centos8": "ami-07fb6c0c050a39292", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-03e63532571847086" + "ubuntu1804": "ami-02a57f04d151d2ce2" }, "ca-central-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-075d7c38d2c7f2347", + "alinux2": "ami-070c24fa069a27265", "centos7": "UNSUPPORTED", - "centos8": "ami-0f6f89bc747dc927c", + "centos8": "ami-04be4e45fe60b19e1", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0e28334f9f98bce1c" + "ubuntu1804": "ami-0bef62a8de524cc3d" }, "cn-north-1": { "alinux": "UNSUPPORTED", @@ -896,67 +904,83 @@ }, "eu-central-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0d99e8ab1b493e9bc", + "alinux2": "ami-0331d559f079efd03", "centos7": "UNSUPPORTED", - "centos8": "ami-0ff2b3c8975a3db85", + "centos8": "ami-02ea94742b76197a4", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0bdcdad522abee324" + "ubuntu1804": "ami-0ab72b2539a8e4eaf" }, "eu-north-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-064946adc4b641961", + "alinux2": "ami-03e4cc7f565c8efec", + "centos7": "UNSUPPORTED", + "centos8": "ami-052bb6243fc4b84a3", + "ubuntu1604": "UNSUPPORTED", + "ubuntu1804": "ami-00315391fc87d6a4a" + }, + "eu-south-1": { + "alinux": "UNSUPPORTED", + "alinux2": "ami-0392d65ba7c8af2b5", "centos7": "UNSUPPORTED", - "centos8": "ami-0047a557a82efb0d0", + "centos8": "ami-015ae8f55912771fe", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0abb2f4b0fb7f9419" + "ubuntu1804": "ami-00d476744294f458a" }, "eu-west-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0bb09e10d8b5cb747", + "alinux2": "ami-0b86c4a8da59d6d11", "centos7": "UNSUPPORTED", - "centos8": "ami-0163897289a397a7b", + "centos8": "ami-0773e67d6a7466681", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-07b73abb892002aba" + "ubuntu1804": "ami-0224e61822e0603ba" }, "eu-west-2": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0f857639eb82d291e", + "alinux2": 
"ami-039c5ad24328545a9", "centos7": "UNSUPPORTED", - "centos8": "ami-041508f55e7353f16", + "centos8": "ami-009ac53f9323b9865", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0c9501e4fae3574e1" + "ubuntu1804": "ami-06496496998c0480a" }, "eu-west-3": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0b606e6e56b851d66", + "alinux2": "ami-067a371d97ca6fcf2", "centos7": "UNSUPPORTED", - "centos8": "ami-08b703b2dce4d24db", + "centos8": "ami-021787c0ad5f60f6b", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0fe23436c8cd7c55d" + "ubuntu1804": "ami-0d16162157fa31ea4" + }, + "me-south-1": { + "alinux": "UNSUPPORTED", + "alinux2": "ami-0ea4909cdeeef4b03", + "centos7": "UNSUPPORTED", + "centos8": "ami-085892d9aec1ed9f9", + "ubuntu1604": "UNSUPPORTED", + "ubuntu1804": "ami-0ce1e2728e4ba14c6" }, "sa-east-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-010d19c007ce5a2a4", + "alinux2": "ami-013861b8a2b1a63b5", "centos7": "UNSUPPORTED", - "centos8": "ami-00cb9fa66203e7004", + "centos8": "ami-0d3e6fbc43aaa308c", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0e2ac51ba079f6df3" + "ubuntu1804": "ami-079a46ad6559d1023" }, "us-east-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0e96b1f22bc4ded89", + "alinux2": "ami-0b1f998cf2b1498db", "centos7": "UNSUPPORTED", - "centos8": "ami-02597a6058d9bb415", + "centos8": "ami-02839a5871b4ec582", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0bb7443216d8ea706" + "ubuntu1804": "ami-0a5c0725ce4d960f1" }, "us-east-2": { "alinux": "UNSUPPORTED", - "alinux2": "ami-065f40fae77a9fb41", + "alinux2": "ami-059703a477566540c", "centos7": "UNSUPPORTED", - "centos8": "ami-0b833d4b88b0df37f", + "centos8": "ami-0d4f0890ef069afc0", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-02a92e06fd643c11b" + "ubuntu1804": "ami-08776e764b05c8fa7" }, "us-gov-east-1": { "alinux": "UNSUPPORTED", @@ -976,45 +1000,53 @@ }, "us-west-1": { "alinux": "UNSUPPORTED", - "alinux2": "ami-0715d8a272300c41f", + "alinux2": "ami-08540a991b0cd29bd", "centos7": "UNSUPPORTED", - "centos8": "ami-00aa1f310b54e3cd5", + "centos8": "ami-0aaeb3457c5f4a511", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-084f3531a939fa437" + "ubuntu1804": "ami-0ca03ec3eca322ace" }, "us-west-2": { "alinux": "UNSUPPORTED", - "alinux2": "ami-01c9e5995bc8ee16a", + "alinux2": "ami-0257f455a26d9ed84", "centos7": "UNSUPPORTED", - "centos8": "ami-0e6ed2c534bab4d35", + "centos8": "ami-02a5f2d441ae4ea8e", "ubuntu1604": "UNSUPPORTED", - "ubuntu1804": "ami-0a81e33be478706e1" + "ubuntu1804": "ami-0f67bacfcdf1f2374" } }, "AWSRegionOS2AMIx86": { + "af-south-1": { + "alinux": "ami-0f2e2135a05f814df", + "alinux2": "ami-046f49b550ce90d8a", + "centos7": "ami-0e0fb5acd64f2be5e", + "centos8": "ami-0d881910a58319c15", + "ubuntu1604": "ami-041e26e4bfed8cd7b", + "ubuntu1804": "ami-04280c7f4bee35afe" + }, "ap-east-1": { - "alinux": "ami-086d023ae9be29265", - "alinux2": "ami-060927fff43c77a88", - "centos7": "ami-0f7168e4940a70237", - "centos8": "ami-081b80aa11a8a4c50", - "ubuntu1604": "ami-0e4408eaab978d69d", - "ubuntu1804": "ami-05bd159bdb29e9aa9" + "alinux": "ami-05ef7cb79d3a43092", + "alinux2": "ami-0ec0d099b8a276aec", + "centos7": "ami-0d6f16d7fceae84ee", + "centos8": "ami-0ca1228e9ddcfa963", + "ubuntu1604": "ami-04539233794f181f7", + "ubuntu1804": "ami-0d16f6585c134b76d" }, "ap-northeast-1": { - "alinux": "ami-071d1f9ecf81d5866", - "alinux2": "ami-068b3b3104ae04d62", - "centos7": "ami-07555b739b7d7f81c", - "centos8": "ami-09151d65b9344095c", - "ubuntu1604": "ami-021df455e65a94ba1", - "ubuntu1804": "ami-0f99a29d4392ac446" + "alinux": 
"ami-006b7f4c929aaf4a9", + "alinux2": "ami-0a13402dc88c19be2", + "centos7": "ami-03a451be7ebcc159e", + "centos8": "ami-075b3892ecd63214f", + "ubuntu1604": "ami-01fe6e75948fa65df", + "ubuntu1804": "ami-096205fd8c1ea23b8" }, "ap-northeast-2": { - "alinux": "ami-0f13dd00403ae009c", - "alinux2": "ami-07cd9137f04b28895", - "centos7": "ami-0c0df7a4adaebab82", - "centos8": "ami-09e91a17a5454daf8", - "ubuntu1604": "ami-02a8aa869263d3268", - "ubuntu1804": "ami-0556b2de682ca759d" + "alinux": "ami-06a49824cc501a981", + "alinux2": "ami-0bdfbd3521caa5dd2", + "centos7": "ami-0ba51cb6c4ceae756", + "centos8": "ami-02287a8528bba818d", + "ubuntu1604": "ami-05dd0b07a0e8cf644", + "ubuntu1804": "ami-04ef860d893888eee" }, "ap-northeast-3": { "alinux": "UNSUPPORTED", @@ -1025,148 +1057,164 @@ "ubuntu1804": "UNSUPPORTED" }, "ap-south-1": { - "alinux": "ami-09952eb2d4dd3f2e2", - "alinux2": "ami-06cc0e0c03bd6abd9", - "centos7": "ami-0d4e69b97911c88aa", - "centos8": "ami-055bc2174e27b050e", - "ubuntu1604": "ami-0b939ee54f0c676de", - "ubuntu1804": "ami-059ddcf5c8e408012" + "alinux": "ami-0a47fd68cf7034c58", + "alinux2": "ami-0059d599d21636768", + "centos7": "ami-0919124b7770af8d9", + "centos8": "ami-0602d9f62b83744f5", + "ubuntu1604": "ami-06aec0a1241e29730", + "ubuntu1804": "ami-09fe484636519a7fd" }, "ap-southeast-1": { - "alinux": "ami-043d99842781aff9d", - "alinux2": "ami-005fd4e1d2ccdd8a9", - "centos7": "ami-0e715873ed4fdf15a", - "centos8": "ami-0f221ad56e980433c", - "ubuntu1604": "ami-04b44a45309e8f7b7", - "ubuntu1804": "ami-04878f0e013df9869" + "alinux": "ami-0e6cfdde386164836", + "alinux2": "ami-074f58cccc7ebb68f", + "centos7": "ami-021e3e90b0458781f", + "centos8": "ami-0e789f414c14f6332", + "ubuntu1604": "ami-0f35154dc0071f71b", + "ubuntu1804": "ami-0d2894ee85aac22c0" }, "ap-southeast-2": { - "alinux": "ami-02656322bc3cae72c", - "alinux2": "ami-0256ef02207960118", - "centos7": "ami-04ba6428ce77e728a", - "centos8": "ami-085265c078d86c75b", - "ubuntu1604": "ami-06367ff848861ddfc", - "ubuntu1804": "ami-02fa7e4f4b1073823" + "alinux": "ami-0ba7f788162c4a4de", + "alinux2": "ami-04b4a20ee9f67608f", + "centos7": "ami-0f5a161afeed62a35", + "centos8": "ami-0d1d7229f7b73a5de", + "ubuntu1604": "ami-0da4c39e17cdfef89", + "ubuntu1804": "ami-07de67a2c91b2605c" }, "ca-central-1": { - "alinux": "ami-0a853d47df00d434a", - "alinux2": "ami-08f2692bcab1f1660", - "centos7": "ami-08f798212e2b9dcc9", - "centos8": "ami-0fad54150dd6e4d1a", - "ubuntu1604": "ami-0438f2fca91841651", - "ubuntu1804": "ami-0e6aa6758cd219754" + "alinux": "ami-0808f2df200c7006c", + "alinux2": "ami-0523a9bc4151ee96e", + "centos7": "ami-057fc92460a096dab", + "centos8": "ami-03ea006f20d390940", + "ubuntu1604": "ami-098c762477bc6b1fb", + "ubuntu1804": "ami-0b8b3c3a561758ae3" }, "cn-north-1": { - "alinux": "ami-03a764b8e78792057", - "alinux2": "ami-0b03a127af1f7956c", + "alinux": "ami-050c2b7b0181fbfd5", + "alinux2": "ami-0a12307d2d0ddc535", "centos7": "UNSUPPORTED", "centos8": "UNSUPPORTED", - "ubuntu1604": "ami-0e4c187d835abb08b", - "ubuntu1804": "ami-099431ae52fbcc1b9" + "ubuntu1604": "ami-0a3f41e4d89bdff32", + "ubuntu1804": "ami-0abc7e40f18e6cda4" }, "cn-northwest-1": { - "alinux": "ami-077d4bb98d5ca5def", - "alinux2": "ami-0beb9d63e7fe38381", + "alinux": "ami-07d8dd2f175498353", + "alinux2": "ami-0ef379c7fd5eb332e", "centos7": "UNSUPPORTED", "centos8": "UNSUPPORTED", - "ubuntu1604": "ami-0be4b9f412690daa2", - "ubuntu1804": "ami-09277e9f7bb212c56" + "ubuntu1604": "ami-0d304a0c5d04ac4e2", + "ubuntu1804": "ami-0f52a155923e4de7f" }, "eu-central-1": { - "alinux": 
"ami-09f3ed2c18ba86996", - "alinux2": "ami-04d384114ab202c13", - "centos7": "ami-0b7dd5253f4218850", - "centos8": "ami-0d986e39b68bd9add", - "ubuntu1604": "ami-0f81af77a1e347a75", - "ubuntu1804": "ami-0928231bd3b6a52b2" + "alinux": "ami-032358687770c43e1", + "alinux2": "ami-07055c21834b0bc56", + "centos7": "ami-0a0f1b95d41e6a651", + "centos8": "ami-02277a3e208351cc6", + "ubuntu1604": "ami-0df794834b461ba22", + "ubuntu1804": "ami-07d1489352b517f39" }, "eu-north-1": { - "alinux": "ami-03d6705f566e99836", - "alinux2": "ami-01afd40fed001ed87", - "centos7": "ami-01d49e3863d1f2d11", - "centos8": "ami-014f4cd90479d0c71", - "ubuntu1604": "ami-0875c09d8a230cc3f", - "ubuntu1804": "ami-0dfc3abcaeeafebd6" + "alinux": "ami-038781ae9b21b98ca", + "alinux2": "ami-0fe475ca307943eb1", + "centos7": "ami-0c191b20554866575", + "centos8": "ami-00b01d23b71fff297", + "ubuntu1604": "ami-06975933696e0263a", + "ubuntu1804": "ami-0c47a559ed268c649" + }, + "eu-south-1": { + "alinux": "ami-0a9a6c50dc32a934f", + "alinux2": "ami-042fc69d71433a75c", + "centos7": "ami-03276d70dacf1a574", + "centos8": "ami-0ffbb41ac00ef5dff", + "ubuntu1604": "ami-0842531296e56778e", + "ubuntu1804": "ami-01cfe122f3044a34d" }, "eu-west-1": { - "alinux": "ami-02c2421d6cd745994", - "alinux2": "ami-04627ea002a11c93c", - "centos7": "ami-009bfc0fb11ad5c6c", - "centos8": "ami-09a3a5c16f6c84015", - "ubuntu1604": "ami-0da1dd9a8b40ee87a", - "ubuntu1804": "ami-06065b90e25ef853b" + "alinux": "ami-0dd799c2e1a68608e", + "alinux2": "ami-063ac3df7f8595751", + "centos7": "ami-000a3d84d3c77fdb3", + "centos8": "ami-04a5c9c04faa5b9f7", + "ubuntu1604": "ami-0d9c6bf221068c7c3", + "ubuntu1804": "ami-0cbef8b383ddeff80" }, "eu-west-2": { - "alinux": "ami-0ef2ff4ce73c7208e", - "alinux2": "ami-0953845dcd74d8d34", - "centos7": "ami-0100afbc14f4008ce", - "centos8": "ami-09956ce8a4fda6388", - "ubuntu1604": "ami-0519e22cbfd281d2c", - "ubuntu1804": "ami-0a26ef861ba74b872" + "alinux": "ami-0849a887182dd033a", + "alinux2": "ami-086111e12527fa455", + "centos7": "ami-0a347ff9b26c5e34c", + "centos8": "ami-017fab72fc1db3851", + "ubuntu1604": "ami-0d2a3fa50134294e9", + "ubuntu1804": "ami-08d337feaf0f0a59f" }, "eu-west-3": { - "alinux": "ami-07e87c1e0dd3d3e75", - "alinux2": "ami-01525b6212f1a191b", - "centos7": "ami-0b2a108cb09e900ac", - "centos8": "ami-08454ae2840055567", - "ubuntu1604": "ami-0bf33211929a48c9f", - "ubuntu1804": "ami-0ee11f5712078f6e3" + "alinux": "ami-060df914a5b5ad680", + "alinux2": "ami-0258a5a8320ccfa42", + "centos7": "ami-02f8c6e2622c3804f", + "centos8": "ami-041e8760a7b80cfde", + "ubuntu1604": "ami-010b0bc4570ec96a3", + "ubuntu1804": "ami-074fae2e6420b8ac7" + }, + "me-south-1": { + "alinux": "ami-0d68c8e916ccf0418", + "alinux2": "ami-0c98692d98eb38c50", + "centos7": "ami-0a64f83dd01c08dab", + "centos8": "ami-0e30fd8da5a1ddc84", + "ubuntu1604": "ami-08d2926b1669d79d2", + "ubuntu1804": "ami-0d7bc19407d2b26a9" }, "sa-east-1": { - "alinux": "ami-0d552ac237f838360", - "alinux2": "ami-0cabfea2e0cf31af2", - "centos7": "ami-0dde4970d46cc7abc", - "centos8": "ami-0f2a6ece6191f85e8", - "ubuntu1604": "ami-0c337a8051bd0ed6f", - "ubuntu1804": "ami-0ff7ab2d586a7f0a6" + "alinux": "ami-0d4a17532432d5aa2", + "alinux2": "ami-0f463bcf6d86cad85", + "centos7": "ami-082aafa914dc04479", + "centos8": "ami-005096f26ee208519", + "ubuntu1604": "ami-0ae46391f64fa0b71", + "ubuntu1804": "ami-0b7b49d35034bbc2f" }, "us-east-1": { - "alinux": "ami-01a1cf6f36f2bd13b", - "alinux2": "ami-018cd948cda2d7384", - "centos7": "ami-0bbd714da0c7763e2", - "centos8": "ami-0615c0838767b8e00", - 
"ubuntu1604": "ami-04ce9ff46c759ffa8", - "ubuntu1804": "ami-05b80d924accf2dac" + "alinux": "ami-0604e4a14869de93f", + "alinux2": "ami-0b71488efbe422723", + "centos7": "ami-0516dc2ba9f4fc177", + "centos8": "ami-0f56d9873066cb6b9", + "ubuntu1604": "ami-0b3dfe986b324a1bf", + "ubuntu1804": "ami-009fdaa0002906c5b" }, "us-east-2": { - "alinux": "ami-0a8d40acce6869be4", - "alinux2": "ami-08aa991eca6bf394a", - "centos7": "ami-0b3d403d025f9ba61", - "centos8": "ami-0a07f43c650ad20e9", - "ubuntu1604": "ami-0eb0a40959685a105", - "ubuntu1804": "ami-05e89121e9222b5c6" + "alinux": "ami-00d4efc81188687a0", + "alinux2": "ami-0075df3faa5b6e07e", + "centos7": "ami-07d1461ceceb4df43", + "centos8": "ami-073e63c94c971cc69", + "ubuntu1604": "ami-04fd4dda7bb2fcaff", + "ubuntu1804": "ami-0ec51c20170525d3f" }, "us-gov-east-1": { - "alinux": "ami-05bb72e83f0973bdc", - "alinux2": "ami-0cbbce3284f341759", + "alinux": "ami-01d57910cd71ea0c4", + "alinux2": "ami-057f7c2d5a1ca7b7d", "centos7": "UNSUPPORTED", "centos8": "UNSUPPORTED", - "ubuntu1604": "ami-09bdb46f2643fefbd", - "ubuntu1804": "ami-0b540f46f29ef9019" + "ubuntu1604": "ami-00150b953797bdaa4", + "ubuntu1804": "ami-04a13dedb7a0a1cfa" }, "us-gov-west-1": { - "alinux": "ami-0318093bb66476048", - "alinux2": "ami-01b605af54e3fdd9e", + "alinux": "ami-0cfc4f4eb94c9f403", + "alinux2": "ami-01222b796bafd609f", "centos7": "UNSUPPORTED", "centos8": "UNSUPPORTED", - "ubuntu1604": "ami-0b790f0b4c3856aa3", - "ubuntu1804": "ami-00c69b11f502f4f08" + "ubuntu1604": "ami-06bfdc6f4185351c5", + "ubuntu1804": "ami-0be2fc1895e4b4d9f" }, "us-west-1": { - "alinux": "ami-0425c898b65b066f3", - "alinux2": "ami-04b508b1dae54310b", - "centos7": "ami-0e589824d8d821ce0", - "centos8": "ami-0e71f40a3e18cb184", - "ubuntu1604": "ami-012699e63307886c5", - "ubuntu1804": "ami-0339ba2c62b77a99e" + "alinux": "ami-0c8decb747bfca25f", + "alinux2": "ami-01c4b0b6d5597b80b", + "centos7": "ami-0a426e145ced105df", + "centos8": "ami-0f3085000b53339e0", + "ubuntu1604": "ami-00fbdde9fb06d3b09", + "ubuntu1804": "ami-04ac69ccbff147270" }, "us-west-2": { - "alinux": "ami-0ab937dbac92ae27e", - "alinux2": "ami-0f085dccfc6937af7", - "centos7": "ami-0f386f4a170027bbb", - "centos8": "ami-0668454eee630c595", - "ubuntu1604": "ami-08b09c217d19d5e1a", - "ubuntu1804": "ami-036a032a9f6c44f84" + "alinux": "ami-018ccd7660ecade5e", + "alinux2": "ami-079facc5ab3fdf701", + "centos7": "ami-0e92bdb4aee551791", + "centos8": "ami-029dc099ae4e121f1", + "ubuntu1604": "ami-008383c0ab2a2d425", + "ubuntu1804": "ami-01a7264e2e3bf272f" } }, "OSFeatures": {