From b6e940852be2cf2c2e3f467187551584d92fa811 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Wed, 3 Apr 2019 12:38:14 +0200 Subject: [PATCH 001/121] Add possibility to generate AMI list from local input files Add possibility to generate AMI list from local cloudformation template and region list passed as input parameters The script will generate amis.txt and a new cloudformation template as usual Signed-off-by: Luca Carrogu --- util/generate-ami-list.py | 66 +++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/util/generate-ami-list.py b/util/generate-ami-list.py index 51db5da200..b774eb4d5a 100644 --- a/util/generate-ami-list.py +++ b/util/generate-ami-list.py @@ -37,7 +37,21 @@ ) -def get_ami_list(regions, date, cookbook_git_ref, node_git_ref, version, owner): +def get_ami_list_from_file(regions, cfn_template_file): + amis_json = {} + + with open(cfn_template_file) as cfn_file: + # object_pairs_hook=OrderedDict allows to preserve input order + cfn_data = json.load(cfn_file, object_pairs_hook=OrderedDict) + + current_amis = cfn_data.get("Mappings").get("AWSRegionOS2AMI") + + for region_name in regions: + amis_json[region_name] = OrderedDict(sorted(current_amis.get(region_name).items())) + return amis_json + + +def get_ami_list_from_ec2(regions, date, cookbook_git_ref, node_git_ref, version, owner): amis_json = {} for region_name in regions: @@ -83,7 +97,20 @@ def convert_json_to_txt(amis_json): return amis_txt -def get_all_aws_regions(region): +def get_aws_regions_from_file(region_file): + # Region file format + # { + # "regions": [ + # "cn-north-1", + # "cn-northwest-1" + # ] + # } + with open(region_file) as r_file: + region_data = json.load(r_file) + return sorted(r for r in region_data.get("regions")) + + +def get_all_aws_regions_from_ec2(region): ec2 = boto3.client("ec2", region_name=region) return sorted(r.get("RegionName") for r in ec2.describe_regions().get("Regions")) @@ -117,18 +144,23 @@ def update_amis_txt(amis_txt_file, amis): if __name__ == "__main__": # parse inputs parser = argparse.ArgumentParser(description="Get AWS ParallelCluster instances and generate a json and txt file") - group1 = parser.add_argument_group("Search by version and date") + group1 = parser.add_argument_group("Retrieve instances from EC2 searching by version and date") group1.add_argument("--version", type=str, help="release version", required=False) group1.add_argument("--date", type=str, help="release date [timestamp] (e.g. 
201801112350)", required=False) - group2 = parser.add_argument_group("Search by cookbook and node git reference") + group2 = parser.add_argument_group("Retrieve instances from EC2 searching by cookbook and node git reference") group2.add_argument("--cookbook-git-ref", type=str, help="cookbook git hash reference", required=False) group2.add_argument("--node-git-ref", type=str, help="node git hash reference", required=False) + group3 = parser.add_argument_group("Retrieve instances from local cfn template for given regions") + group3.add_argument("--json-template", type=str, help="path to input json cloudformation template", required=False) + group3.add_argument( + "--json-regions", type=str, help="path to input json file containing the regions", required=False + ) parser.add_argument("--txt-file", type=str, help="txt output file path", required=False, default="amis.txt") parser.add_argument("--partition", type=str, help="commercial | china | govcloud", required=True) parser.add_argument( "--cloudformation-template", type=str, - help="path to cloudfomation template", + help="path to output cloudfomation template", required=False, default="cloudformation/aws-parallelcluster.cfn.json", ) @@ -147,17 +179,19 @@ def update_amis_txt(amis_txt_file, amis): print("Unsupported partition %s" % args.partition) sys.exit(1) - regions = get_all_aws_regions(region) - - amis_dict = get_ami_list( - regions=regions, - date=args.date, - cookbook_git_ref=args.cookbook_git_ref, - node_git_ref=args.node_git_ref, - version=args.version, - owner=account_id, - ) + if (args.version and args.date) or (args.cookbook_git_ref and args.node_git_ref): + regions = get_all_aws_regions_from_ec2(region) + amis_dict = get_ami_list_from_ec2( + regions=regions, + date=args.date, + cookbook_git_ref=args.cookbook_git_ref, + node_git_ref=args.node_git_ref, + version=args.version, + owner=account_id, + ) + else: + regions = get_aws_regions_from_file(args.json_regions) + amis_dict = get_ami_list_from_file(regions, args.json_template) cfn_amis = update_cfn_template(cfn_template_file=args.cloudformation_template, amis_to_update=amis_dict) - update_amis_txt(amis_txt_file=args.txt_file, amis=cfn_amis) From f971bd47bf70d5c161646731423cc953c3638603 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Fri, 5 Apr 2019 14:56:30 +0200 Subject: [PATCH 002/121] Integ Tests: disable iE3008 because of a cfn-python-lint issue The latest release 0.18.0 re-opened the bug: https://github.com/awslabs/cfn-python-lint/issues/564 Signed-off-by: Enrico Usai --- cli/tox.ini | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cli/tox.ini b/cli/tox.ini index 565059f727..a247c9b3a9 100644 --- a/cli/tox.ini +++ b/cli/tox.ini @@ -209,9 +209,10 @@ deps = cfn-lint # E2504 disabled since does not allow two-digit numbers in ephemeral(n) # W2507 disabled since we want to have nullable String type parameters # E2523 disabled since we have both a Launch Template and Launch Configuration +# iE3008 disabled because of https://github.com/awslabs/cfn-python-lint/issues/564 commands = cfn-lint -iE2504 -iW2507 -iE2523 aws-parallelcluster.cfn.json - cfn-lint batch-substack.cfn.json + cfn-lint -iE3008 batch-substack.cfn.json cfn-lint ebs-substack.cfn.json cfn-lint efs-substack.cfn.json cfn-lint raid-substack.cfn.json From 1876d800a08f7588328caf76d649f762e2cd47a3 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Tue, 2 Apr 2019 11:55:39 +0200 Subject: [PATCH 003/121] Integ tests: add required python version to readme Signed-off-by: Enrico Usai --- 
tests/integration-tests/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md index d62b46c4ae..d1f7db70bd 100644 --- a/tests/integration-tests/README.md +++ b/tests/integration-tests/README.md @@ -11,6 +11,8 @@ config generation. ## Run Integration Tests +To run the integration tests you have to use Python 3.7. + Before executing integration tests it is required to install all the python dependencies required by the framework. In order to do that simply run the following command: ```bash From ff5d272d142046931dd532cfded20a1651abe239 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Tue, 2 Apr 2019 12:07:28 +0200 Subject: [PATCH 004/121] Integ tests: use node.name in place of node.id for test_name I had an OSError, Invalid argument, because node.nodeid was: tests_outputs/.../test_slurm.py::test_slurm[c5.xlarge-us-west-1-alinux-slurm].config instead node.name is: tests_outputs/.../test_slurm[c5.xlarge-us-west-1-alinux-slurm].config Doc: https://docs.pytest.org/en/latest/_modules/_pytest/nodes.html Signed-off-by: Enrico Usai --- tests/integration-tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 6aeb851890..4e5cbce8d5 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -176,11 +176,11 @@ def _cluster_factory(cluster_config): def _write_cluster_config_to_outdir(request, cluster_config): out_dir = request.config.getoption("output_dir") os.makedirs( - "{out_dir}/clusters_configs/{test_dir}".format(out_dir=out_dir, test_dir=os.path.dirname(request.node.nodeid)), + "{out_dir}/clusters_configs/{test_dir}".format(out_dir=out_dir, test_dir=os.path.dirname(request.node.name)), exist_ok=True, ) cluster_config_dst = "{out_dir}/clusters_configs/{test_name}.config".format( - out_dir=out_dir, test_name=request.node.nodeid + out_dir=out_dir, test_name=request.node.name ) copyfile(cluster_config, cluster_config_dst) return cluster_config_dst From ad112f69a374990465334e9b3c94363f2758580b Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Tue, 2 Apr 2019 16:01:04 +0200 Subject: [PATCH 005/121] Integ tests: extend clusters_factory fixture to return the factory Signed-off-by: Enrico Usai --- tests/integration-tests/conftest.py | 2 +- tests/integration-tests/tests/schedulers/test_awsbatch.py | 2 +- tests/integration-tests/tests/schedulers/test_slurm.py | 2 +- tests/integration-tests/tests/storage/test_ebs.py | 6 +++--- tests/integration-tests/tests/storage/test_fsx_lustre.py | 2 +- tests/integration-tests/tests/storage/test_raid.py | 4 ++-- tests/integration-tests/tests/test_scaling.py | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 4e5cbce8d5..11866cfeed 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -167,7 +167,7 @@ def _cluster_factory(cluster_config): ssh_key=request.config.getoption("key_path"), ) factory.create_cluster(cluster) - return cluster + return cluster, factory yield _cluster_factory factory.destroy_all_clusters() diff --git a/tests/integration-tests/tests/schedulers/test_awsbatch.py b/tests/integration-tests/tests/schedulers/test_awsbatch.py index 17b0ad6e88..ce2608eebb 100644 --- a/tests/integration-tests/tests/schedulers/test_awsbatch.py +++ b/tests/integration-tests/tests/schedulers/test_awsbatch.py @@ -29,7 +29,7 
@@ def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir): Grouped all tests in a single function so that cluster can be reused for all of them. """ cluster_config = pcluster_config_reader() - cluster = clusters_factory(cluster_config) + cluster, _ = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) _test_simple_job_submission(remote_command_executor, test_datadir) diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index 2b773d2bf5..8125b411be 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -35,7 +35,7 @@ def test_slurm(region, pcluster_config_reader, clusters_factory): scaledown_idletime = 3 max_queue_size = 5 cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size) - cluster = clusters_factory(cluster_config) + cluster, _ = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) _test_slurm_version(remote_command_executor) diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index 07d685ff82..c9c6f9b53b 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -26,7 +26,7 @@ def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory): mount_dir = "ebs_mount_dir" cluster_config = pcluster_config_reader(mount_dir=mount_dir) - cluster = clusters_factory(cluster_config) + cluster, _ = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) mount_dir = "/" + mount_dir @@ -43,7 +43,7 @@ def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory): mount_dirs = ["/ebs_mount_dir_{0}".format(i) for i in range(0, 5)] volume_sizes = [15 + 5 * i for i in range(0, 5)] cluster_config = pcluster_config_reader(mount_dirs=mount_dirs, volume_sizes=volume_sizes) - cluster = clusters_factory(cluster_config) + cluster, _ = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) @@ -58,7 +58,7 @@ def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory): @pytest.mark.usefixtures("region", "os", "instance") def test_default_ebs(scheduler, pcluster_config_reader, clusters_factory): cluster_config = pcluster_config_reader() - cluster = clusters_factory(cluster_config) + cluster, _ = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) mount_dir = "/shared" diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre.py b/tests/integration-tests/tests/storage/test_fsx_lustre.py index f9f3ce2805..7560c2a933 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre.py +++ b/tests/integration-tests/tests/storage/test_fsx_lustre.py @@ -35,7 +35,7 @@ def test_fsx_lustre(region, pcluster_config_reader, clusters_factory, s3_bucket_ bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) bucket.upload_file(str(test_datadir / "s3_test_file"), "s3_test_file") cluster_config = pcluster_config_reader(bucket_name=bucket_name, mount_dir=mount_dir) - cluster = clusters_factory(cluster_config) + cluster, _ = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) 
_test_fsx_lustre_correctly_mounted(remote_command_executor, mount_dir) diff --git a/tests/integration-tests/tests/storage/test_raid.py b/tests/integration-tests/tests/storage/test_raid.py index f159be2306..70a9bdba21 100644 --- a/tests/integration-tests/tests/storage/test_raid.py +++ b/tests/integration-tests/tests/storage/test_raid.py @@ -25,7 +25,7 @@ @pytest.mark.usefixtures("region", "os", "instance") def test_raid_performance_mode(scheduler, pcluster_config_reader, clusters_factory): cluster_config = pcluster_config_reader() - cluster = clusters_factory(cluster_config) + cluster, _ = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) @@ -41,7 +41,7 @@ def test_raid_performance_mode(scheduler, pcluster_config_reader, clusters_facto @pytest.mark.usefixtures("region", "os", "instance") def test_raid_fault_tolerance_mode(scheduler, pcluster_config_reader, clusters_factory): cluster_config = pcluster_config_reader() - cluster = clusters_factory(cluster_config) + cluster, _ = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index 24e4d2d218..650f8aed8c 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -29,7 +29,7 @@ def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clu max_jobs_execution_time = 9 cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime) - cluster = clusters_factory(cluster_config) + cluster, _ = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) From 78d9da3cab657a459ef80f63c00538a2440d973c Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Wed, 3 Apr 2019 12:17:35 +0200 Subject: [PATCH 006/121] Integ Tests: add get_max_asg_capacity function Signed-off-by: Enrico Usai --- .../tests/common/scaling_common.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py index 6f0da610e6..77e37e82d7 100644 --- a/tests/integration-tests/tests/common/scaling_common.py +++ b/tests/integration-tests/tests/common/scaling_common.py @@ -71,10 +71,20 @@ def _watch_compute_nodes_allocation(): return asg_capacity_time_series, compute_nodes_time_series, timestamps -def _get_desired_asg_capacity(region, stack_name): - """Retrieve the desired capacity of the autoscaling group for a specific cluster.""" +def _get_asg(region, stack_name): + """Retrieve the autoscaling group for a specific cluster.""" asg_conn = boto3.client("autoscaling", region_name=region) tags = asg_conn.describe_tags(Filters=[{"Name": "value", "Values": [stack_name]}]) asg_name = tags.get("Tags")[0].get("ResourceId") response = asg_conn.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) - return response["AutoScalingGroups"][0]["DesiredCapacity"] + return response["AutoScalingGroups"][0] + + +def _get_desired_asg_capacity(region, stack_name): + """Retrieve the desired capacity of the autoscaling group for a specific cluster.""" + return _get_asg(region, stack_name)["DesiredCapacity"] + + +def 
get_max_asg_capacity(region, stack_name):
+    """Retrieve the max capacity of the autoscaling group for a specific cluster."""
+    return _get_asg(region, stack_name)["MaxSize"]

From c209a201c2d42e0f0b20fbf506e2bbb8fc527bf4 Mon Sep 17 00:00:00 2001
From: Enrico Usai
Date: Wed, 3 Apr 2019 13:03:27 +0200
Subject: [PATCH 007/121] Integ Tests: add first test for the pcluster update
 command

The test changes the max_queue_size setting and verifies the value after
the update command.

Signed-off-by: Enrico Usai
---
 tests/integration-tests/clusters_factory.py  | 32 +++++++++++
 .../tests/update/__init__.py                 | 11 ++++
 .../tests/update/test_update.py              | 54 +++++++++++++++++++
 .../test_update/pcluster.config.ini          | 24 +++++++++
 4 files changed, 121 insertions(+)
 create mode 100644 tests/integration-tests/tests/update/__init__.py
 create mode 100644 tests/integration-tests/tests/update/test_update.py
 create mode 100644 tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini

diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py
index 031ed28cc0..83ea213c2e 100644
--- a/tests/integration-tests/clusters_factory.py
+++ b/tests/integration-tests/clusters_factory.py
@@ -30,6 +30,11 @@ def __init__(self, name, config_file, ssh_key):
         self.__cfn_outputs = None
         self.__cfn_resources = None

+    def update(self):
+        """Rewrite the configuration file starting from the self.config object."""
+        with open(self.config_file, "w") as configfile:
+            self.config.write(configfile)
+
     @property
     def cfn_name(self):
         """Return the name of the CloudFormation stack associated to the cluster."""
@@ -108,6 +113,33 @@ def create_cluster(self, cluster):
             logging.info("Sleeping for 60 seconds in case cluster is not ready yet")
             time.sleep(60)

+    def update_cluster(self, cluster, reset_desired=False, extra_params=None):
+        """
+        Update a cluster with a given config.
+        :param cluster: cluster to update.
+        :param reset_desired: reset the current ASG desired capacity to initial config values
+        :param extra_params: extra parameters to pass to stack update
+        """
+        name = cluster.name
+        config = cluster.config_file
+
+        # update the cluster
+        logging.info("Updating cluster {0} with config {1}".format(name, config))
+        self.__created_clusters[name] = cluster
+
+        command = ["pcluster", "update", "--config", config]
+        if reset_desired:
+            command.append("--reset-desired")
+        if extra_params:
+            command.extend(["--extra-parameters", extra_params])
+        command.append(name)
+        result = run_command(command)
+        if "Status: {0} - UPDATE_COMPLETE".format(cluster.cfn_name) not in result.stdout:
+            error = "Cluster update failed for {0} with output: {1}".format(name, result.stdout)
+            logging.error(error)
+            raise Exception(error)
+        logging.info("Cluster {0} updated successfully".format(name))
+
     @retry(stop_max_attempt_number=10, wait_fixed=5000, retry_on_exception=retry_if_subprocess_error)
     def destroy_cluster(self, name):
         """Destroy a created cluster."""
diff --git a/tests/integration-tests/tests/update/__init__.py b/tests/integration-tests/tests/update/__init__.py
new file mode 100644
index 0000000000..2251b11f46
--- /dev/null
+++ b/tests/integration-tests/tests/update/__init__.py
@@ -0,0 +1,11 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py new file mode 100644 index 0000000000..b52486966d --- /dev/null +++ b/tests/integration-tests/tests/update/test_update.py @@ -0,0 +1,54 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import boto3 +import pytest +from assertpy import assert_that +from tests.common.scaling_common import get_max_asg_capacity + + +@pytest.mark.regions(["us-west-1"]) +@pytest.mark.schedulers(["slurm"]) +@pytest.mark.oss(["alinux"]) +@pytest.mark.usefixtures("os", "scheduler") +def test_update(region, pcluster_config_reader, clusters_factory): + """ + Test 'pcluster update' command. + + Grouped all tests in a single function so that cluster can be reused for all of them. + """ + max_queue_size = 5 + compute_instance = "c5.xlarge" + + cluster_config = pcluster_config_reader(max_queue_size=max_queue_size, compute_instance=compute_instance) + cluster, factory = clusters_factory(cluster_config) + _test_asg_size(region, cluster.cfn_name, max_queue_size) + + _test_update_max_queue(region, cluster, factory) + + +def _test_update_max_queue(region, cluster, factory): + new_queue_size = 10 + _update_cluster_property(cluster, "max_queue_size", str(new_queue_size)) + + factory.update_cluster(cluster) + _test_asg_size(region, cluster.cfn_name, new_queue_size) + + +def _test_asg_size(region, stack_name, queue_size): + asg_max_size = get_max_asg_capacity(region, stack_name) + assert_that(asg_max_size).is_equal_to(queue_size) + + +def _update_cluster_property(cluster, property_name, property_value): + cluster.config.set("cluster default", property_name, property_value) + # update configuration file + cluster.update() diff --git a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini new file mode 100644 index 0000000000..72d06fd46e --- /dev/null +++ b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini @@ -0,0 +1,24 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = slurm +master_instance_type = c5.xlarge +compute_instance_type = {{ compute_instance }} +initial_queue_size = 1 +max_queue_size = {{ max_queue_size }} +maintain_initial_size = true +scaling_settings = custom + +[scaling custom] +scaledown_idletime = 3 + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} From b37cf8db4255eae16db38b090d39fe0cb48c29a2 Mon Sep 17 
00:00:00 2001 From: Enrico Usai Date: Wed, 3 Apr 2019 18:12:18 +0200 Subject: [PATCH 008/121] Integ Tests: extend update test by changing the compute instance type Signed-off-by: Enrico Usai --- .../tests/common/scaling_common.py | 33 ++++++++++ .../tests/update/test_update.py | 62 +++++++++++++++---- 2 files changed, 83 insertions(+), 12 deletions(-) diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py index 77e37e82d7..4edc0e8c76 100644 --- a/tests/integration-tests/tests/common/scaling_common.py +++ b/tests/integration-tests/tests/common/scaling_common.py @@ -71,6 +71,39 @@ def _watch_compute_nodes_allocation(): return asg_capacity_time_series, compute_nodes_time_series, timestamps +def watch_compute_nodes(scheduler_commands, max_monitoring_time, number_of_nodes): + """Watch periodically the number of nodes seen by the scheduler.""" + compute_nodes_time_series = [] + timestamps = [] + + @retry( + # Retry until the given number_of_nodes is equal to the number of compute nodes + retry_on_result=lambda _: compute_nodes_time_series[-1] != number_of_nodes, + wait_fixed=seconds(20), + stop_max_delay=max_monitoring_time, + ) + def _watch_compute_nodes_allocation(): + compute_nodes = scheduler_commands.compute_nodes_count() + timestamp = time.time() + + # add values only if there is a transition. + if len(compute_nodes_time_series) == 0 or compute_nodes_time_series[-1] != compute_nodes: + compute_nodes_time_series.append(compute_nodes) + timestamps.append(timestamp) + + try: + _watch_compute_nodes_allocation() + except RetryError: + # ignoring this error in order to perform assertions on the collected data. + pass + + logging.info( + "Monitoring completed: %s, %s", + "compute_nodes_time_series [" + " ".join(map(str, compute_nodes_time_series)) + "]", + "timestamps [" + " ".join(map(str, timestamps)) + "]", + ) + + def _get_asg(region, stack_name): """Retrieve the autoscaling group for a specific cluster.""" asg_conn = boto3.client("autoscaling", region_name=region) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index b52486966d..bd24d78de3 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -11,8 +11,12 @@ # See the License for the specific language governing permissions and limitations under the License. 
import boto3 import pytest + from assertpy import assert_that -from tests.common.scaling_common import get_max_asg_capacity +from remote_command_executor import RemoteCommandExecutor +from tests.common.scaling_common import get_max_asg_capacity, watch_compute_nodes +from tests.common.schedulers_common import SlurmCommands +from time_utils import minutes @pytest.mark.regions(["us-west-1"]) @@ -30,25 +34,59 @@ def test_update(region, pcluster_config_reader, clusters_factory): cluster_config = pcluster_config_reader(max_queue_size=max_queue_size, compute_instance=compute_instance) cluster, factory = clusters_factory(cluster_config) - _test_asg_size(region, cluster.cfn_name, max_queue_size) - - _test_update_max_queue(region, cluster, factory) + # Verify initial settings + _test_max_queue(region, cluster.cfn_name, max_queue_size) + _test_compute_instance_type(cluster.cfn_name, compute_instance) + # Configuration parameters for the update test + new_max_queue_size = 10 + new_compute_instance = "c4.xlarge" -def _test_update_max_queue(region, cluster, factory): - new_queue_size = 10 - _update_cluster_property(cluster, "max_queue_size", str(new_queue_size)) - + # change config settings + _update_cluster_property(cluster, "max_queue_size", str(new_max_queue_size)) + _update_cluster_property(cluster, "compute_instance_type", new_compute_instance) + # update configuration file + cluster.update() + # update cluster factory.update_cluster(cluster) - _test_asg_size(region, cluster.cfn_name, new_queue_size) + + # test update + _test_max_queue(region, cluster.cfn_name, new_max_queue_size) + _test_update_compute_instance_type(cluster, new_compute_instance) -def _test_asg_size(region, stack_name, queue_size): +def _test_max_queue(region, stack_name, queue_size): asg_max_size = get_max_asg_capacity(region, stack_name) assert_that(asg_max_size).is_equal_to(queue_size) +def _test_update_compute_instance_type(cluster, new_compute_instance): + # submit a job to perform a scaling up action and have a new instance + number_of_nodes = 2 + remote_command_executor = RemoteCommandExecutor(cluster) + slurm_commands = SlurmCommands(remote_command_executor) + result = slurm_commands.submit_command("sleep 60", nodes=number_of_nodes) + slurm_commands.assert_job_submitted(result.stdout) + + estimated_scaleup_time = 5 + watch_compute_nodes( + scheduler_commands=slurm_commands, + max_monitoring_time=minutes(estimated_scaleup_time), + number_of_nodes=number_of_nodes, + ) + _test_compute_instance_type(cluster.cfn_name, new_compute_instance) + + +def _test_compute_instance_type(stack_name, compute_instance_type): + ec2_client = boto3.resource("ec2") + instance_ids = [] + instance_types = [] + for instance in ec2_client.instances.filter(Filters=[{"Name": "tag:Application", "Values": [stack_name]}]): + instance_ids.append(instance.instance_id) + instance_types.append(instance.instance_type) + + assert_that(instance_types).contains(compute_instance_type) + + def _update_cluster_property(cluster, property_name, property_value): cluster.config.set("cluster default", property_name, property_value) - # update configuration file - cluster.update() From f3c5f000113cb89b5e5d464f63f4cbc7b06679c5 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Fri, 5 Apr 2019 15:49:19 +0200 Subject: [PATCH 009/121] Integ Tests: refactor update test to have more atomic functions Signed-off-by: Enrico Usai --- .../tests/update/test_update.py | 47 ++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git 
a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index bd24d78de3..521292e874 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -9,6 +9,8 @@ # or in the "LICENSE.txt" file accompanying this file. # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. +from collections import namedtuple + import boto3 import pytest @@ -18,6 +20,8 @@ from tests.common.schedulers_common import SlurmCommands from time_utils import minutes +PclusterConfig = namedtuple("PclusterConfig", ["max_queue_size", "compute_instance"]) + @pytest.mark.regions(["us-west-1"]) @pytest.mark.schedulers(["slurm"]) @@ -29,30 +33,43 @@ def test_update(region, pcluster_config_reader, clusters_factory): Grouped all tests in a single function so that cluster can be reused for all of them. """ - max_queue_size = 5 - compute_instance = "c5.xlarge" + init_config = PclusterConfig(max_queue_size=5, compute_instance="c5.xlarge") + cluster, factory = _init_cluster(region, clusters_factory, pcluster_config_reader, init_config) + + updated_config = PclusterConfig(max_queue_size=10, compute_instance="c4.xlarge") + _update_cluster(cluster, factory, updated_config) + + # test update + _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size) + _test_update_compute_instance_type(cluster, updated_config.compute_instance) - cluster_config = pcluster_config_reader(max_queue_size=max_queue_size, compute_instance=compute_instance) + +def _init_cluster(region, clusters_factory, pcluster_config_reader, config): + # read configuration and create cluster + cluster_config = pcluster_config_reader( + max_queue_size=config.max_queue_size, compute_instance=config.compute_instance + ) cluster, factory = clusters_factory(cluster_config) + # Verify initial settings - _test_max_queue(region, cluster.cfn_name, max_queue_size) - _test_compute_instance_type(cluster.cfn_name, compute_instance) + _test_max_queue(region, cluster.cfn_name, config.max_queue_size) + _test_compute_instance_type(cluster.cfn_name, config.compute_instance) + + return cluster, factory - # Configuration parameters for the update test - new_max_queue_size = 10 - new_compute_instance = "c4.xlarge" +def _update_cluster(cluster, factory, config): # change config settings - _update_cluster_property(cluster, "max_queue_size", str(new_max_queue_size)) - _update_cluster_property(cluster, "compute_instance_type", new_compute_instance) + _update_cluster_property(cluster, "max_queue_size", str(config.max_queue_size)) + _update_cluster_property(cluster, "compute_instance_type", config.compute_instance) # update configuration file cluster.update() # update cluster factory.update_cluster(cluster) - # test update - _test_max_queue(region, cluster.cfn_name, new_max_queue_size) - _test_update_compute_instance_type(cluster, new_compute_instance) + +def _update_cluster_property(cluster, property_name, property_value): + cluster.config.set("cluster default", property_name, property_value) def _test_max_queue(region, stack_name, queue_size): @@ -86,7 +103,3 @@ def _test_compute_instance_type(stack_name, compute_instance_type): instance_types.append(instance.instance_type) assert_that(instance_types).contains(compute_instance_type) - - -def _update_cluster_property(cluster, property_name, property_value): - 
cluster.config.set("cluster default", property_name, property_value)

From caf90fea15428d99516c48775ab6c60576f22e6a Mon Sep 17 00:00:00 2001
From: Enrico Usai
Date: Fri, 5 Apr 2019 16:36:20 +0200
Subject: [PATCH 010/121] cfn-lint: Restore iE3008 since the issue has been
 resolved

They just released version 0.18.1, which fixes
https://github.com/awslabs/cfn-python-lint/issues/564

Signed-off-by: Enrico Usai
---
 cli/tox.ini | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cli/tox.ini b/cli/tox.ini
index a247c9b3a9..565059f727 100644
--- a/cli/tox.ini
+++ b/cli/tox.ini
@@ -209,10 +209,9 @@ deps = cfn-lint
 # E2504 disabled since does not allow two-digit numbers in ephemeral(n)
 # W2507 disabled since we want to have nullable String type parameters
 # E2523 disabled since we have both a Launch Template and Launch Configuration
-# iE3008 disabled because of https://github.com/awslabs/cfn-python-lint/issues/564
 commands =
     cfn-lint -iE2504 -iW2507 -iE2523 aws-parallelcluster.cfn.json
-    cfn-lint -iE3008 batch-substack.cfn.json
+    cfn-lint batch-substack.cfn.json
     cfn-lint ebs-substack.cfn.json
     cfn-lint efs-substack.cfn.json
     cfn-lint raid-substack.cfn.json

From 42791891099a16af3c46878aa7fcddec6f30eb06 Mon Sep 17 00:00:00 2001
From: Enrico Usai
Date: Mon, 8 Apr 2019 10:26:01 +0200
Subject: [PATCH 011/121] Integ Tests: Add missing region fixture to update
 test

It was failing with:
botocore.exceptions.NoRegionError: You must specify a region.

Signed-off-by: Enrico Usai
---
 tests/integration-tests/tests/update/test_update.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py
index 521292e874..fb2cbadced 100644
--- a/tests/integration-tests/tests/update/test_update.py
+++ b/tests/integration-tests/tests/update/test_update.py
@@ -26,7 +26,7 @@
 @pytest.mark.regions(["us-west-1"])
 @pytest.mark.schedulers(["slurm"])
 @pytest.mark.oss(["alinux"])
-@pytest.mark.usefixtures("os", "scheduler")
+@pytest.mark.usefixtures("region", "os", "scheduler")
 def test_update(region, pcluster_config_reader, clusters_factory):
     """
     Test 'pcluster update' command.

From 183a4ed86022e318e28662e0acf7246bf744af3a Mon Sep 17 00:00:00 2001
From: Sean Smith
Date: Wed, 3 Apr 2019 16:46:18 -0700
Subject: [PATCH 012/121] Display Public IPs for SSH

The configuration option `use_public_ips = true` is used to toggle
allocation of Elastic IPs. Many customers don't want to have to use an
Elastic IP, but they still want to ssh into their instance. This patch
allows users to run `pcluster ssh cluster` when `use_public_ips = false`.
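
As an illustration, a cluster whose vpc section disables public IPs can now
be reached through the ssh helper. This is a minimal sketch: the ids are
hypothetical placeholders, and it assumes `use_public_ips` sits in the vpc
section of the standard config, as in the test configs used in this series:

    [vpc parallelcluster-vpc]
    vpc_id = vpc-0123456789abcdef0
    master_subnet_id = subnet-0123456789abcdef0
    use_public_ips = false

    $ pcluster ssh mycluster

The diff below drops the "MasterPublicIp" Condition from the affected stack
outputs, so they are always emitted instead of only when an Elastic IP is
allocated.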
Signed-off-by: Sean Smith --- cloudformation/aws-parallelcluster.cfn.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index aae3381aba..9c3f51b2ae 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -4753,8 +4753,7 @@ "MasterServer", "PublicIp" ] - }, - "Condition": "MasterPublicIp" + } }, "GangliaPrivateURL": { "Description": "Private URL to access Ganglia (disabled by default)", @@ -4790,8 +4789,7 @@ "/ganglia/" ] ] - }, - "Condition": "MasterPublicIp" + } }, "ResourcesS3Bucket": { "Description": "S3 user bucket where AWS ParallelCluster resources are stored", From 6170e20d765c7366fbe22c75cf965d6756d16b79 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 11 Apr 2019 11:08:26 +0200 Subject: [PATCH 013/121] Doc: Improve pre/post-install script usage + document how to identify master/compute in pre/post-install scripts + extend the example to use the post_install_args parameter + add a paragraph to specify where to find the output Signed-off-by: Enrico Usai --- docs/pre_post_install.rst | 79 +++++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 16 deletions(-) diff --git a/docs/pre_post_install.rst b/docs/pre_post_install.rst index bb31165822..313af5dcf5 100644 --- a/docs/pre_post_install.rst +++ b/docs/pre_post_install.rst @@ -7,19 +7,39 @@ AWS ParallelCluster can execute arbitrary code either before(pre) or after(post) cluster creation. This code is typically stored in S3 and accessed via HTTP(S) during cluster creation. The code will be executed as root and can be in any script language supported by the cluster OS, typically `bash` or `python`. -pre-install actions are called before any cluster deployment bootstrap such as configuring NAT, EBS and the scheduler. +Pre-install actions are called before any cluster deployment bootstrap such as configuring NAT, EBS and the scheduler. Typical pre-install actions may include modifying storage, adding extra users or packages. -post-install actions are called after cluster bootstrap is complete, as the last action before an instance is +Post-install actions are called after cluster bootstrap is complete, as the last action before an instance is considered complete. Typical post-install actions may include changing scheduler settings, modifying storage or packages. -Arguments can be passed to scripts by specifying them in the config. These will be passed double-quoted to the -pre/post-install actions. +Arguments can be passed to scripts by specifying them in the config. If a pre/post-install actions fails, then the instance bootstrap will be considered failed and it will not continue. Success is signalled with an exit code of 0, any other exit code will be considered a fail. +It is possible to differentiate between master and compute nodes execution by sourcing +the ``/etc/parallelcluster/cfnconfig`` file and evaluating the ``cfn_node_type`` environment variable, +whose possible values are ``MasterServer`` and ``ComputeFleet`` for the master and compute node respectively. + +:: + + #!/bin/bash + + . "/etc/parallelcluster/cfnconfig" + + case "${cfn_node_type}" in + MasterServer) + echo "I am the master" >> /tmp/master.txt + ;; + ComputeFleet) + echo "I am a compute node" >> /tmp/compute.txt + ;; + *) + ;; + esac + Configuration ------------- @@ -28,22 +48,19 @@ are not required for basic cluster install. :: - # URL to a preinstall script. 
This is executed before any of the boot_as_* scripts are run - # (defaults to NONE) + # URL to a preinstall script. This is executed before any of the boot_as_* scripts are run (defaults to NONE) pre_install = NONE - # Arguments to be passed to preinstall script - # (defaults to NONE) + # Arguments to be passed to preinstall script (defaults to NONE) pre_install_args = NONE - # URL to a postinstall script. This is executed after any of the boot_as_* scripts are run - # (defaults to NONE) + # URL to a postinstall script. This is executed after any of the boot_as_* scripts are run (defaults to NONE) post_install = NONE - # Arguments to be passed to postinstall script - # (defaults to NONE) + # Arguments to be passed to postinstall script (defaults to NONE) post_install_args = NONE Arguments --------- The first two arguments ``$0`` and ``$1`` are reserved for the script name and url. +If the pre/post_install_args variable contains a list of parameters it must be double quoted. See example below. :: @@ -51,18 +68,30 @@ The first two arguments ``$0`` and ``$1`` are reserved for the script name and u $1 => s3 url $n => args set by pre/post_install_args +Output +------ +The output of the pre/post-install scripts can be found in the ``/var/log/cfn-init.log`` +and ``/var/log/cfn-init-cmd.log`` files. + Example ------- -The following are some steps to create a simple post install script that installs the R packages in a cluster. +The following are some steps to create a simple post install script that installs a list of packages, specified by the +``post_install_args`` configuration parameter, in a cluster. -1. Create a script. For the R example, see below +1. Create a script :: #!/bin/bash - yum -y install --enablerepo=epel R + echo "post-install script has $# arguments" + for arg in "$@" + do + echo "arg: ${arg}" + done + + yum -y install "${@:2}" 2. Upload the script with the correct permissions to S3 @@ -75,6 +104,7 @@ The following are some steps to create a simple post install script that install [cluster default] ... post_install = https://.s3.amazonaws.com/myscript.sh + post_install_args = "R curl wget" If the bucket does not have public-read permission use ``s3`` as URL scheme. @@ -83,8 +113,25 @@ If the bucket does not have public-read permission use ``s3`` as URL scheme. [cluster default] ... post_install = s3:///myscript.sh - + post_install_args = "R curl wget" 4. Launch a cluster ``pcluster create mycluster`` + + +5. Verify the output + +:: + + $ less /var/log/cfn-init.log + 2019-04-11 10:43:54,588 [DEBUG] Command runpostinstall output: post-install script has 4 arguments + arg: s3://eu-eu-west-1/test.sh + arg: R + arg: curl + arg: wget + Loaded plugins: dkms-build-requires, priorities, update-motd, upgrade-helper + Package R-3.4.1-1.52.amzn1.x86_64 already installed and latest version + Package curl-7.61.1-7.91.amzn1.x86_64 already installed and latest version + Package wget-1.18-4.29.amzn1.x86_64 already installed and latest version + Nothing to do From 02006256587e2534122448a280cf043241689608 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 12 Apr 2019 15:45:04 +0200 Subject: [PATCH 014/121] Revert "Double the estimated scaledown time" This reverts commit dc1b5926600f41dee5a1a36a9f5a6d8cb72211a1. 
Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/schedulers/test_slurm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index 8125b411be..d1bbb8c8e0 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -100,7 +100,6 @@ def _test_job_dependencies(remote_command_executor, region, stack_name, scaledow jobs_execution_time = 1 estimated_scaleup_time = 5 - estimated_scaledown_time = 20 asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation( scheduler_commands=slurm_commands, region=region, @@ -108,7 +107,7 @@ def _test_job_dependencies(remote_command_executor, region, stack_name, scaledow max_monitoring_time=minutes(jobs_execution_time) + minutes(scaledown_idletime) + minutes(estimated_scaleup_time) - + minutes(estimated_scaledown_time), + + minutes(10), ) assert_that(max(asg_capacity_time_series)).is_equal_to(1) assert_that(max(compute_nodes_time_series)).is_equal_to(1) From 3193b5788b608d2760f7bc92092693c613f73b04 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 12 Apr 2019 15:48:34 +0200 Subject: [PATCH 015/121] Integ tests: use same parametrized instance type for master and compute node Signed-off-by: Francesco De Martino --- .../schedulers/test_awsbatch/test_awsbatch/pcluster.config.ini | 1 + .../tests/storage/test_ebs/test_default_ebs/pcluster.config.ini | 1 + .../tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini | 1 + .../tests/storage/test_ebs/test_ebs_single/pcluster.config.ini | 1 + .../storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini | 1 + .../test_raid/test_raid_fault_tolerance_mode/pcluster.config.ini | 1 + .../test_raid/test_raid_performance_mode/pcluster.config.ini | 1 + 7 files changed, 7 insertions(+) diff --git a/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.ini b/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.ini index 360de75fd0..bd9a0d9d3b 100644 --- a/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.ini +++ b/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = awsbatch +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} min_vcpus = 2 desired_vcpus = 2 diff --git a/tests/integration-tests/tests/storage/test_ebs/test_default_ebs/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_default_ebs/pcluster.config.ini index 5732d70e6c..95dce375f3 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_default_ebs/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_default_ebs/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini index 6aea380572..ac36864f10 100644 --- 
a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini index 811370854f..70c62e03f4 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini index bef59dc140..e12910c073 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/storage/test_raid/test_raid_fault_tolerance_mode/pcluster.config.ini b/tests/integration-tests/tests/storage/test_raid/test_raid_fault_tolerance_mode/pcluster.config.ini index 6f79aa858f..fd70cf4e0e 100644 --- a/tests/integration-tests/tests/storage/test_raid/test_raid_fault_tolerance_mode/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_raid/test_raid_fault_tolerance_mode/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/storage/test_raid/test_raid_performance_mode/pcluster.config.ini b/tests/integration-tests/tests/storage/test_raid/test_raid_performance_mode/pcluster.config.ini index 3bba593301..f429ab5a35 100644 --- a/tests/integration-tests/tests/storage/test_raid/test_raid_performance_mode/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_raid/test_raid_performance_mode/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 From 51b6b7deee879761951c95be2c17bf0d3f7b7f70 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 12 Apr 2019 15:57:35 +0200 Subject: [PATCH 016/121] Integ tests: redistribute tests across regions Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/schedulers/test_awsbatch.py | 2 +- tests/integration-tests/tests/storage/test_ebs.py | 6 +++--- 
tests/integration-tests/tests/storage/test_fsx_lustre.py | 2 +- tests/integration-tests/tests/storage/test_raid.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/integration-tests/tests/schedulers/test_awsbatch.py b/tests/integration-tests/tests/schedulers/test_awsbatch.py index ce2608eebb..9621aa6a29 100644 --- a/tests/integration-tests/tests/schedulers/test_awsbatch.py +++ b/tests/integration-tests/tests/schedulers/test_awsbatch.py @@ -18,7 +18,7 @@ from tests.common.schedulers_common import AWSBatchCommands -@pytest.mark.regions(["us-east-1", "eu-west-1"]) +@pytest.mark.regions(["eu-west-1"]) @pytest.mark.instances(["c5.xlarge", "t2.large"]) @pytest.mark.dimensions("*", "*", "alinux", "awsbatch") @pytest.mark.usefixtures("region", "os", "instance", "scheduler") diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index c9c6f9b53b..4ea84a0fde 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -19,7 +19,7 @@ from tests.storage.storage_common import verify_directory_correctly_shared -@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]) +@pytest.mark.regions(["us-west-2", "cn-north-1", "us-gov-west-1"]) @pytest.mark.instances(["c4.xlarge", "c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") @@ -35,7 +35,7 @@ def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory): _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) -@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]) +@pytest.mark.regions(["us-east-1", "cn-north-1", "us-gov-east-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") @@ -52,7 +52,7 @@ def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory): _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) -@pytest.mark.regions(["us-east-1", "cn-north-1", "us-gov-west-1"]) +@pytest.mark.regions(["eu-west-2", "cn-northwest-1", "us-gov-west-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre.py b/tests/integration-tests/tests/storage/test_fsx_lustre.py index 7560c2a933..67b0ba55b8 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre.py +++ b/tests/integration-tests/tests/storage/test_fsx_lustre.py @@ -19,7 +19,7 @@ from tests.common.schedulers_common import SgeCommands -@pytest.mark.regions(["us-east-1", "eu-west-1"]) +@pytest.mark.regions(["eu-central-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.oss(["centos7", "alinux"]) @pytest.mark.schedulers(["sge"]) diff --git a/tests/integration-tests/tests/storage/test_raid.py b/tests/integration-tests/tests/storage/test_raid.py index 70a9bdba21..9d82f70bf3 100644 --- a/tests/integration-tests/tests/storage/test_raid.py +++ b/tests/integration-tests/tests/storage/test_raid.py @@ -19,7 +19,7 @@ from tests.storage.storage_common import verify_directory_correctly_shared -@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]) +@pytest.mark.regions(["ap-south-1", "cn-northwest-1", "us-gov-east-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) 
@pytest.mark.usefixtures("region", "os", "instance") @@ -35,7 +35,7 @@ def test_raid_performance_mode(scheduler, pcluster_config_reader, clusters_facto _test_raid_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) -@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]) +@pytest.mark.regions(["us-east-2", "cn-north-1", "us-gov-west-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") From 02f1c3f6130bfbf69553ddc25358b26c6cf94ede Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 15 Apr 2019 12:27:55 +0200 Subject: [PATCH 017/121] Integ tests: add succeeded count to json report Signed-off-by: Francesco De Martino --- tests/integration-tests/reports_generator.py | 23 ++++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/integration-tests/reports_generator.py b/tests/integration-tests/reports_generator.py index 89eb6d1bc7..c5f8eb89c3 100644 --- a/tests/integration-tests/reports_generator.py +++ b/tests/integration-tests/reports_generator.py @@ -58,6 +58,8 @@ def generate_json_report(test_results_dir): _record_results(results, root, "./testcase[error]/properties/property", "errors") _record_results(results, root, "./testcase/properties/property", "total") + _record_succeeded_results(results) + with open("{0}/test_report.json".format(test_results_dir), "w") as out_f: out_f.write(json.dumps(results, indent=4)) @@ -65,16 +67,23 @@ def generate_json_report(test_results_dir): def _record_results(results_dict, results_xml_root, xpath_exp, label): - for skipped in results_xml_root.findall(xpath_exp): - if not skipped.get("name") in results_dict: - results_dict[skipped.get("name")] = {} - if not skipped.get("value") in results_dict[skipped.get("name")]: - results_dict[skipped.get("name")].update({skipped.get("value"): _empty_results_dict()}) - results_dict[skipped.get("name")][skipped.get("value")][label] += 1 + for match in results_xml_root.findall(xpath_exp): + if not match.get("name") in results_dict: + results_dict[match.get("name")] = {} + if not match.get("value") in results_dict[match.get("name")]: + results_dict[match.get("name")].update({match.get("value"): _empty_results_dict()}) + results_dict[match.get("name")][match.get("value")][label] += 1 def _empty_results_dict(): return {"total": 0, "skipped": 0, "failures": 0, "errors": 0} -# generate_tabular_report("1549489575.329696.out", None, None, None, None) +def _record_succeeded_results(results): + results["all"]["succeeded"] = ( + results["all"]["total"] - results["all"]["skipped"] - results["all"]["failures"] - results["all"]["errors"] + ) + for dimension in results: + if dimension != "all": + for result in results[dimension].values(): + result["succeeded"] = result["total"] - result["skipped"] - result["failures"] - result["errors"] From 18cdb1c8f2f2a6f0065444c0063499dcb796c846 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 15 Apr 2019 12:30:02 +0200 Subject: [PATCH 018/121] Integ tests: verify compute nodes scale-down with Slurm Signed-off-by: Francesco De Martino --- .../integration-tests/tests/schedulers/test_slurm.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index d1bbb8c8e0..45d35cbbd6 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ 
b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -41,7 +41,7 @@ def test_slurm(region, pcluster_config_reader, clusters_factory): _test_slurm_version(remote_command_executor) _test_dynamic_max_cluster_size(remote_command_executor, region, cluster.asg) _test_cluster_limits(remote_command_executor, max_queue_size, region, cluster.asg) - _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) + _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime, max_queue_size) _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size) @@ -82,7 +82,7 @@ def _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size): _assert_dummy_nodes(remote_command_executor, max_queue_size - 1) -def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime): +def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime, max_queue_size): logging.info("Testing cluster doesn't scale when job dependencies are not satisfied") slurm_commands = SlurmCommands(remote_command_executor) result = slurm_commands.submit_command("sleep 60", nodes=1) @@ -113,6 +113,8 @@ def _test_job_dependencies(remote_command_executor, region, stack_name, scaledow assert_that(max(compute_nodes_time_series)).is_equal_to(1) assert_that(asg_capacity_time_series[-1]).is_equal_to(0) assert_that(compute_nodes_time_series[-1]).is_equal_to(0) + _assert_dummy_nodes(remote_command_executor, max_queue_size) + assert_that(_retrieve_slurm_nodes_from_config(remote_command_executor)).is_empty() def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name): @@ -136,6 +138,11 @@ def _retrieve_slurm_dummy_nodes_from_config(remote_command_executor): return remote_command_executor.run_remote_command(retrieve_dummy_nodes_command).stdout +def _retrieve_slurm_nodes_from_config(remote_command_executor): + retrieve_dummy_nodes_command = "sudo tail -n +2 /opt/slurm/etc/slurm_parallelcluster_nodes.conf" + return remote_command_executor.run_remote_command(retrieve_dummy_nodes_command).stdout + + def _retrieve_slurm_dummy_nodes(remote_command_executor): retrieve_dummy_nodes_command = "scontrol -F show nodes | grep 'State=FUTURE'" return len(remote_command_executor.run_remote_command(retrieve_dummy_nodes_command).stdout.split("\n")) From 70ce8176a618062f90bd4e4c27218a06aa57834d Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 15 Apr 2019 14:53:21 +0200 Subject: [PATCH 019/121] Integ tests: add variable for max_scaledown_time Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/schedulers/test_slurm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index 45d35cbbd6..05652d5998 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -100,6 +100,7 @@ def _test_job_dependencies(remote_command_executor, region, stack_name, scaledow jobs_execution_time = 1 estimated_scaleup_time = 5 + max_scaledown_time = 10 asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation( scheduler_commands=slurm_commands, region=region, @@ -107,7 +108,7 @@ def _test_job_dependencies(remote_command_executor, region, stack_name, scaledow max_monitoring_time=minutes(jobs_execution_time) + minutes(scaledown_idletime) + minutes(estimated_scaleup_time) - + 
minutes(10), + + minutes(max_scaledown_time), ) assert_that(max(asg_capacity_time_series)).is_equal_to(1) assert_that(max(compute_nodes_time_series)).is_equal_to(1) From 7ecf396926158d541a099472113536d9c9e38ead Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Fri, 12 Apr 2019 15:44:11 +0200 Subject: [PATCH 020/121] Integ test: fix update_test by adding missing region to the boto3 call + add instance fixture to be able to skip the test if the instance is not available (e.g. specific partition) + use eu-west-1 region to avoid to overload the us-west-1 Signed-off-by: Enrico Usai --- .../tests/update/test_update.py | 21 ++++++++++--------- .../test_update/pcluster.config.ini | 4 ++-- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index fb2cbadced..d886325440 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -23,17 +23,18 @@ PclusterConfig = namedtuple("PclusterConfig", ["max_queue_size", "compute_instance"]) -@pytest.mark.regions(["us-west-1"]) +@pytest.mark.regions(["eu-west-1"]) @pytest.mark.schedulers(["slurm"]) @pytest.mark.oss(["alinux"]) -@pytest.mark.usefixtures("region", "os", "scheduler") -def test_update(region, pcluster_config_reader, clusters_factory): +@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.usefixtures("os", "scheduler") +def test_update(instance, region, pcluster_config_reader, clusters_factory): """ Test 'pcluster update' command. Grouped all tests in a single function so that cluster can be reused for all of them. """ - init_config = PclusterConfig(max_queue_size=5, compute_instance="c5.xlarge") + init_config = PclusterConfig(max_queue_size=5, compute_instance=instance) cluster, factory = _init_cluster(region, clusters_factory, pcluster_config_reader, init_config) updated_config = PclusterConfig(max_queue_size=10, compute_instance="c4.xlarge") @@ -41,7 +42,7 @@ def test_update(region, pcluster_config_reader, clusters_factory): # test update _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size) - _test_update_compute_instance_type(cluster, updated_config.compute_instance) + _test_update_compute_instance_type(region, cluster, updated_config.compute_instance) def _init_cluster(region, clusters_factory, pcluster_config_reader, config): @@ -53,7 +54,7 @@ def _init_cluster(region, clusters_factory, pcluster_config_reader, config): # Verify initial settings _test_max_queue(region, cluster.cfn_name, config.max_queue_size) - _test_compute_instance_type(cluster.cfn_name, config.compute_instance) + _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance) return cluster, factory @@ -77,7 +78,7 @@ def _test_max_queue(region, stack_name, queue_size): assert_that(asg_max_size).is_equal_to(queue_size) -def _test_update_compute_instance_type(cluster, new_compute_instance): +def _test_update_compute_instance_type(region, cluster, new_compute_instance): # submit a job to perform a scaling up action and have a new instance number_of_nodes = 2 remote_command_executor = RemoteCommandExecutor(cluster) @@ -91,11 +92,11 @@ def _test_update_compute_instance_type(cluster, new_compute_instance): max_monitoring_time=minutes(estimated_scaleup_time), number_of_nodes=number_of_nodes, ) - _test_compute_instance_type(cluster.cfn_name, new_compute_instance) + _test_compute_instance_type(region, cluster.cfn_name, new_compute_instance) -def 
_test_compute_instance_type(stack_name, compute_instance_type):
-    ec2_client = boto3.resource("ec2")
+def _test_compute_instance_type(region, stack_name, compute_instance_type):
+    ec2_client = boto3.resource("ec2", region_name=region)
     instance_ids = []
     instance_types = []
     for instance in ec2_client.instances.filter(Filters=[{"Name": "tag:Application", "Values": [stack_name]}]):

diff --git a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini
index 72d06fd46e..728eceab23 100644
--- a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini
+++ b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini
@@ -8,8 +8,8 @@ aws_region_name = {{ region }}
 base_os = {{ os }}
 key_name = {{ key_name }}
 vpc_settings = parallelcluster-vpc
-scheduler = slurm
-master_instance_type = c5.xlarge
+scheduler = {{ scheduler }}
+master_instance_type = {{ instance }}
 compute_instance_type = {{ compute_instance }}
 initial_queue_size = 1
 max_queue_size = {{ max_queue_size }}

From ee41e675933605ea34149f8a511e99681b54ca81 Mon Sep 17 00:00:00 2001
From: Enrico Usai
Date: Fri, 12 Apr 2019 15:53:24 +0200
Subject: [PATCH 021/121] Integ tests: restore use of node_id to have the test file name in the config

node.nodeid is:
tests_outputs/.../test_slurm.py::test_slurm[c5.xlarge-us-west-1-alinux-slurm].config
while node.name is:
tests_outputs/.../test_slurm[c5.xlarge-us-west-1-alinux-slurm].config

We changed it to node.name in a previous commit because the "::" characters
were a problem for Git Bash on Windows (OSError: Invalid argument).

Now we are restoring the nodeid, replacing the bad characters, because the
nodeid contains both the test file name (test_slurm.py) and the test name
(test_slurm), so the configuration file name is unique.

Signed-off-by: Enrico Usai
---
 tests/integration-tests/conftest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index 11866cfeed..4bacded4a6 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -176,11 +176,11 @@ def _cluster_factory(cluster_config):
 def _write_cluster_config_to_outdir(request, cluster_config):
     out_dir = request.config.getoption("output_dir")
     os.makedirs(
-        "{out_dir}/clusters_configs/{test_dir}".format(out_dir=out_dir, test_dir=os.path.dirname(request.node.name)),
+        "{out_dir}/clusters_configs/{test_dir}".format(out_dir=out_dir, test_dir=os.path.dirname(request.node.nodeid)),
         exist_ok=True,
     )
     cluster_config_dst = "{out_dir}/clusters_configs/{test_name}.config".format(
-        out_dir=out_dir, test_name=request.node.name
+        out_dir=out_dir, test_name=request.node.nodeid.replace("::", "-")
     )
     copyfile(cluster_config, cluster_config_dst)
     return cluster_config_dst

From db492879481e12cba9705ee71ee84742d673f945 Mon Sep 17 00:00:00 2001
From: Enrico Usai
Date: Fri, 12 Apr 2019 16:12:55 +0200
Subject: [PATCH 022/121] Integ tests: move update method from ClustersFactory to Cluster object

The idea is to give the factory the power to create and destroy the
clusters, while giving the Cluster the power to manage itself.
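A rough sketch of the intended call shape after this refactoring (illustrative
only, not part of the diff; the names are the ones introduced in this patch):

    # the factory still creates the cluster and owns its teardown
    cluster = clusters_factory(cluster_config)

    # the cluster now updates itself, replacing factory.update_cluster(cluster)
    cluster.update(reset_desired=True)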
Signed-off-by: Enrico Usai --- tests/integration-tests/clusters_factory.py | 55 ++++++++----------- tests/integration-tests/conftest.py | 2 +- .../tests/schedulers/test_awsbatch.py | 2 +- .../tests/schedulers/test_slurm.py | 2 +- .../tests/storage/test_ebs.py | 6 +- .../tests/storage/test_fsx_lustre.py | 2 +- .../tests/storage/test_raid.py | 4 +- tests/integration-tests/tests/test_scaling.py | 2 +- .../tests/update/test_update.py | 19 ++++--- 9 files changed, 44 insertions(+), 50 deletions(-) diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index 83ea213c2e..46bf917312 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -30,10 +30,30 @@ def __init__(self, name, config_file, ssh_key): self.__cfn_outputs = None self.__cfn_resources = None - def update(self): - """Rewrite the configuration file starting from the self.config object.""" - with open(self.config_file, "w") as configfile: - self.config.write(configfile) + def update(self, reset_desired=False, extra_params=None): + """ + Update a cluster with an already updated config. + :param reset_desired: reset the current ASG desired capacity to initial config values + :param extra_params: extra parameters to pass to stack update + """ + # update the cluster + logging.info("Updating cluster {0} with config {1}".format(self.name, self.config_file)) + command = ["pcluster", "update", "--config", self.config_file] + if reset_desired: + command.append("--reset-desired") + if extra_params: + command.extend(["--extra-parameters", extra_params]) + command.append(self.name) + result = run_command(command) + if "Status: {0} - UPDATE_COMPLETE".format(self.cfn_name) not in result.stdout: + error = "Cluster update failed for {0} with output: {1}".format(self.name, result.stdout) + logging.error(error) + raise Exception(error) + logging.info("Cluster {0} updated successfully".format(self.name)) + + # reset cached properties + self.__cfn_outputs = None + self.__cfn_resources = None @property def cfn_name(self): @@ -113,33 +133,6 @@ def create_cluster(self, cluster): logging.info("Sleeping for 60 seconds in case cluster is not ready yet") time.sleep(60) - def update_cluster(self, cluster, reset_desired=False, extra_params=None): - """ - Create a cluster with a given config. - :param cluster: cluster to update. 
- :param reset_desired: reset the current ASG desired capacity to initial config values - :param extra_params: extra parameters to pass to stack update - """ - name = cluster.name - config = cluster.config_file - - # update the cluster - logging.info("Updating cluster {0} with config {1}".format(name, config)) - self.__created_clusters[name] = cluster - - command = ["pcluster", "update", "--config", config] - if reset_desired: - command.append("--reset-desired") - if extra_params: - command.extend(["--extra-parameters", extra_params]) - command.append(name) - result = run_command(command) - if "Status: {0} - UPDATE_COMPLETE".format(cluster.cfn_name) not in result.stdout: - error = "Cluster update failed for {0} with output: {1}".format(name, result.stdout) - logging.error(error) - raise Exception(error) - logging.info("Cluster {0} updated successfully".format(name)) - @retry(stop_max_attempt_number=10, wait_fixed=5000, retry_on_exception=retry_if_subprocess_error) def destroy_cluster(self, name): """Destroy a created cluster.""" diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 4bacded4a6..9ecf62097d 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -167,7 +167,7 @@ def _cluster_factory(cluster_config): ssh_key=request.config.getoption("key_path"), ) factory.create_cluster(cluster) - return cluster, factory + return cluster yield _cluster_factory factory.destroy_all_clusters() diff --git a/tests/integration-tests/tests/schedulers/test_awsbatch.py b/tests/integration-tests/tests/schedulers/test_awsbatch.py index 9621aa6a29..4c278bcf59 100644 --- a/tests/integration-tests/tests/schedulers/test_awsbatch.py +++ b/tests/integration-tests/tests/schedulers/test_awsbatch.py @@ -29,7 +29,7 @@ def test_awsbatch(pcluster_config_reader, clusters_factory, test_datadir): Grouped all tests in a single function so that cluster can be reused for all of them. 
""" cluster_config = pcluster_config_reader() - cluster, _ = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) _test_simple_job_submission(remote_command_executor, test_datadir) diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index 05652d5998..cd8f7322a7 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -35,7 +35,7 @@ def test_slurm(region, pcluster_config_reader, clusters_factory): scaledown_idletime = 3 max_queue_size = 5 cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size) - cluster, _ = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) _test_slurm_version(remote_command_executor) diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index 4ea84a0fde..588f69fab9 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -26,7 +26,7 @@ def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory): mount_dir = "ebs_mount_dir" cluster_config = pcluster_config_reader(mount_dir=mount_dir) - cluster, _ = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) mount_dir = "/" + mount_dir @@ -43,7 +43,7 @@ def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory): mount_dirs = ["/ebs_mount_dir_{0}".format(i) for i in range(0, 5)] volume_sizes = [15 + 5 * i for i in range(0, 5)] cluster_config = pcluster_config_reader(mount_dirs=mount_dirs, volume_sizes=volume_sizes) - cluster, _ = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) @@ -58,7 +58,7 @@ def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory): @pytest.mark.usefixtures("region", "os", "instance") def test_default_ebs(scheduler, pcluster_config_reader, clusters_factory): cluster_config = pcluster_config_reader() - cluster, _ = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) mount_dir = "/shared" diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre.py b/tests/integration-tests/tests/storage/test_fsx_lustre.py index 67b0ba55b8..de4a31f93e 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre.py +++ b/tests/integration-tests/tests/storage/test_fsx_lustre.py @@ -35,7 +35,7 @@ def test_fsx_lustre(region, pcluster_config_reader, clusters_factory, s3_bucket_ bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) bucket.upload_file(str(test_datadir / "s3_test_file"), "s3_test_file") cluster_config = pcluster_config_reader(bucket_name=bucket_name, mount_dir=mount_dir) - cluster, _ = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) _test_fsx_lustre_correctly_mounted(remote_command_executor, mount_dir) diff --git a/tests/integration-tests/tests/storage/test_raid.py b/tests/integration-tests/tests/storage/test_raid.py index 9d82f70bf3..7d9d4d7205 
100644 --- a/tests/integration-tests/tests/storage/test_raid.py +++ b/tests/integration-tests/tests/storage/test_raid.py @@ -25,7 +25,7 @@ @pytest.mark.usefixtures("region", "os", "instance") def test_raid_performance_mode(scheduler, pcluster_config_reader, clusters_factory): cluster_config = pcluster_config_reader() - cluster, _ = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) @@ -41,7 +41,7 @@ def test_raid_performance_mode(scheduler, pcluster_config_reader, clusters_facto @pytest.mark.usefixtures("region", "os", "instance") def test_raid_fault_tolerance_mode(scheduler, pcluster_config_reader, clusters_factory): cluster_config = pcluster_config_reader() - cluster, _ = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index 650f8aed8c..24e4d2d218 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -29,7 +29,7 @@ def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clu max_jobs_execution_time = 9 cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime) - cluster, _ = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index d886325440..02bfd86a1c 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -35,10 +35,10 @@ def test_update(instance, region, pcluster_config_reader, clusters_factory): Grouped all tests in a single function so that cluster can be reused for all of them. 
""" init_config = PclusterConfig(max_queue_size=5, compute_instance=instance) - cluster, factory = _init_cluster(region, clusters_factory, pcluster_config_reader, init_config) + cluster = _init_cluster(region, clusters_factory, pcluster_config_reader, init_config) updated_config = PclusterConfig(max_queue_size=10, compute_instance="c4.xlarge") - _update_cluster(cluster, factory, updated_config) + _update_cluster(cluster, updated_config) # test update _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size) @@ -50,23 +50,24 @@ def _init_cluster(region, clusters_factory, pcluster_config_reader, config): cluster_config = pcluster_config_reader( max_queue_size=config.max_queue_size, compute_instance=config.compute_instance ) - cluster, factory = clusters_factory(cluster_config) + cluster = clusters_factory(cluster_config) # Verify initial settings _test_max_queue(region, cluster.cfn_name, config.max_queue_size) _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance) - return cluster, factory + return cluster -def _update_cluster(cluster, factory, config): - # change config settings +def _update_cluster(cluster, config): + # change cluster.config settings _update_cluster_property(cluster, "max_queue_size", str(config.max_queue_size)) _update_cluster_property(cluster, "compute_instance_type", config.compute_instance) - # update configuration file - cluster.update() + # rewrite configuration file starting from the updated cluster.config object + with open(cluster.config_file, "w") as configfile: + cluster.config.write(configfile) # update cluster - factory.update_cluster(cluster) + cluster.update() def _update_cluster_property(cluster, property_name, property_value): From 859191d068af02a25d23e17922ff6095d4387606 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Mon, 15 Apr 2019 13:05:44 +0200 Subject: [PATCH 023/121] Integ test: minor improvements to the test_update - Rename PclusterConfig to PClusterConfig - Use dimensions in place of markers - Remove unused instance_ids variable Signed-off-by: Enrico Usai --- tests/integration-tests/tests/update/test_update.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index 02bfd86a1c..7478143e9c 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -20,13 +20,10 @@ from tests.common.schedulers_common import SlurmCommands from time_utils import minutes -PclusterConfig = namedtuple("PclusterConfig", ["max_queue_size", "compute_instance"]) +PClusterConfig = namedtuple("PClusterConfig", ["max_queue_size", "compute_instance"]) -@pytest.mark.regions(["eu-west-1"]) -@pytest.mark.schedulers(["slurm"]) -@pytest.mark.oss(["alinux"]) -@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.dimensions("eu-west-1", "c5.xlarge", "alinux", "slurm") @pytest.mark.usefixtures("os", "scheduler") def test_update(instance, region, pcluster_config_reader, clusters_factory): """ @@ -34,10 +31,10 @@ def test_update(instance, region, pcluster_config_reader, clusters_factory): Grouped all tests in a single function so that cluster can be reused for all of them. 
""" - init_config = PclusterConfig(max_queue_size=5, compute_instance=instance) + init_config = PClusterConfig(max_queue_size=5, compute_instance=instance) cluster = _init_cluster(region, clusters_factory, pcluster_config_reader, init_config) - updated_config = PclusterConfig(max_queue_size=10, compute_instance="c4.xlarge") + updated_config = PClusterConfig(max_queue_size=10, compute_instance="c4.xlarge") _update_cluster(cluster, updated_config) # test update @@ -98,10 +95,8 @@ def _test_update_compute_instance_type(region, cluster, new_compute_instance): def _test_compute_instance_type(region, stack_name, compute_instance_type): ec2_client = boto3.resource("ec2", region_name=region) - instance_ids = [] instance_types = [] for instance in ec2_client.instances.filter(Filters=[{"Name": "tag:Application", "Values": [stack_name]}]): - instance_ids.append(instance.instance_id) instance_types.append(instance.instance_type) assert_that(instance_types).contains(compute_instance_type) From 5535a0192fd1dc0feb1d9c108bef512b7a61669d Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 8 Apr 2019 15:34:04 +0200 Subject: [PATCH 024/121] Bump version to 2.3.2 alpha 1 Signed-off-by: Luca Carrogu --- cli/setup.py | 2 +- cloudformation/aws-parallelcluster.cfn.json | 6 +++--- docs/conf.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/setup.py b/cli/setup.py index b744e7cc3e..02c009e035 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -21,7 +21,7 @@ def readme(): return f.read() -VERSION = "2.3.1" +VERSION = "2.3.2a1" REQUIRES = ["boto3>=1.9.54", "future>=0.16.0,<=0.17.1", "tabulate>=0.8.2,<=0.8.3"] if sys.version_info[:2] == (2, 6): diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 9c3f51b2ae..2945c970b1 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1,6 +1,6 @@ { "AWSTemplateFormatVersion": "2010-09-09", - "Description": "AWS ParallelCluster Template. Version: aws-parallelcluster-2.3.1", + "Description": "AWS ParallelCluster Template. Version: aws-parallelcluster-2.3.2a1", "Metadata": { "AWS::CloudFormation::Interface": { "ParameterGroups": [ @@ -1384,8 +1384,8 @@ }, "PackagesVersions": { "default": { - "parallelcluster": "2.3.1", - "cookbook": "aws-parallelcluster-cookbook-2.3.1", + "parallelcluster": "2.3.2a1", + "cookbook": "aws-parallelcluster-cookbook-2.3.2", "chef": "14.2.0", "ridley": "5.1.1", "berkshelf": "7.0.4", diff --git a/docs/conf.py b/docs/conf.py index 453e2252ea..95b1cb3a9e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -53,7 +53,7 @@ # The short X.Y version. version = '2.3' # The full version, including alpha/beta/rc tags. -release = '2.3.1' +release = '2.3.2a1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From 6acd6d6937fbbd7e8b09b10e75a8a8a7ff44e9cc Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 16 Apr 2019 09:56:24 +0200 Subject: [PATCH 025/121] integ tests: run FSx tests in us-east-1 FSx Lustre not available in eu-central-1 Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/storage/test_fsx_lustre.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre.py b/tests/integration-tests/tests/storage/test_fsx_lustre.py index de4a31f93e..d40bbddf08 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre.py +++ b/tests/integration-tests/tests/storage/test_fsx_lustre.py @@ -19,7 +19,7 @@ from tests.common.schedulers_common import SgeCommands -@pytest.mark.regions(["eu-central-1"]) +@pytest.mark.regions(["us-east-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.oss(["centos7", "alinux"]) @pytest.mark.schedulers(["sge"]) From 1bf93820f86bf6efa57b1559c31402309265b713 Mon Sep 17 00:00:00 2001 From: ParallelCluster AMI bot Date: Tue, 16 Apr 2019 16:11:20 +0000 Subject: [PATCH 026/121] Update AMI List Build Number 78 aws-parallelcluster-cookbook Git hash: 52e9500e95896640f0b1892445d5a65ff627d410 aws-parallelcluster-node Git hash: 0581c019cb07650f936c2f0305e10287a26aab0b Signed-off-by: ParallelCluster AMI bot --- amis.txt | 192 ++++++++++---------- cloudformation/aws-parallelcluster.cfn.json | 192 ++++++++++---------- 2 files changed, 194 insertions(+), 190 deletions(-) diff --git a/amis.txt b/amis.txt index a4c0fd5dd6..fbd9f2559b 100644 --- a/amis.txt +++ b/amis.txt @@ -1,100 +1,102 @@ # alinux -ap-northeast-1: ami-0af8c1a29f58c3b91 -ap-northeast-2: ami-036c289fda8701f9d -ap-northeast-3: ami-000902aa3082732ce -ap-south-1: ami-00ff6216daa4b0a69 -ap-southeast-1: ami-03b015a13daa9ff8d -ap-southeast-2: ami-0c2528255cc7c4cec -ca-central-1: ami-05bad5df22b9502e5 -cn-north-1: ami-0227bedfc6798cba1 -cn-northwest-1: ami-08143603c5f390f20 -eu-central-1: ami-003262ea853b26050 -eu-north-1: ami-06cac8aed0729f14c -eu-west-1: ami-0691d6d6d4d209e09 -eu-west-2: ami-0d241a5c57ee3421d -eu-west-3: ami-0e59dd1d2794a857c -sa-east-1: ami-07b044055a13cf93e -us-east-1: ami-0f8b01b1377483305 -us-east-2: ami-049afa5b53a7880d8 -us-gov-east-1: ami-02ee5c66a10526bd1 -us-gov-west-1: ami-7da7d01c -us-west-1: ami-02c87842ea944292e -us-west-2: ami-09b457d5cba24514a +ap-northeast-1: ami-08bbc6b440ae39278 +ap-northeast-2: ami-0b94cc099eb2afed1 +ap-northeast-3: ami-08f450d90bf811e75 +ap-south-1: ami-0991980c18e306e8e +ap-southeast-1: ami-0b28cded2d076c436 +ap-southeast-2: ami-0f6bd7f7e505539e1 +ca-central-1: ami-0b680d93b3d56c72a +cn-north-1: ami-0decad57dff4bb3ea +cn-northwest-1: ami-090c0f959ead7a813 +eu-central-1: ami-0f188f96685b31355 +eu-north-1: ami-08053d8c06274f531 +eu-west-1: ami-0fd242bc2abb1b146 +eu-west-2: ami-046b84d0cba5d292f +eu-west-3: ami-0046b758814704b9f +sa-east-1: ami-06cc41ea6409482e5 +us-east-1: ami-02787c6b3550be361 +us-east-2: ami-0c4b18bab302f095b +us-gov-east-1: ami-063828ccbf0ad3cfe +us-gov-west-1: ami-9bd7a6fa +us-west-1: ami-027a3a20855e7be03 +us-west-2: ami-00b4f23ba0a57a141 # centos6 -ap-northeast-1: ami-0476984f547d1f4f2 -ap-northeast-2: ami-06ecb1e81881cd450 -ap-northeast-3: ami-04d195b55ddf56228 -ap-south-1: ami-0b1abd2bf8810487c -ap-southeast-1: ami-0576b4b2db8272abf -ap-southeast-2: ami-09a18baab0a142123 -ca-central-1: ami-0aa03a3f1b737c651 -eu-central-1: ami-092bd9c46746d940b -eu-north-1: ami-07b83433077d8345b -eu-west-1: ami-09880c7e25df69af8 -eu-west-2: ami-0eba961d9f30431b2 
-eu-west-3: ami-0d0b243ac76765544 -sa-east-1: ami-0dfdc6ab8bf7935ea -us-east-1: ami-00f71e3be938f3077 -us-east-2: ami-0b29637d31cf774aa -us-west-1: ami-08dc392067bcf9807 -us-west-2: ami-0fa309858f6ce66ee +ap-northeast-1: ami-0b27f204a8adab23b +ap-northeast-2: ami-010f98c99b632ece4 +ap-northeast-3: ami-09dd084db04a10d89 +ap-south-1: ami-0f4b9b7eb6f7c8bf4 +ap-southeast-1: ami-0d756e8aa7628a1d9 +ap-southeast-2: ami-00e287b293a3b7978 +ca-central-1: ami-0f34197ba57c32264 +eu-central-1: ami-0560468e7585063ec +eu-north-1: ami-0b8ee035751765dd5 +eu-west-1: ami-07fb0fe1262b74210 +eu-west-2: ami-00bae65e0cb9fbd31 +eu-west-3: ami-0fa09ce7bae809182 +sa-east-1: ami-014a018823095eb14 +us-east-1: ami-0b0130c133fae6607 +us-east-2: ami-0f4af6aff781edc2e +us-west-1: ami-01fef859674ae14f1 +us-west-2: ami-0c0a6c53fc044e440 # centos7 -ap-northeast-1: ami-0f13f45e966236e46 -ap-northeast-2: ami-016c726d8902d133c -ap-northeast-3: ami-037c3a13cd142c8f8 -ap-south-1: ami-06b7212503b9d9637 -ap-southeast-1: ami-0c39937e9ae643ecd -ap-southeast-2: ami-0164dbfb6b7b938f5 -ca-central-1: ami-0ee7cb4d2673e78de -eu-central-1: ami-0bcced571d9cc0142 -eu-north-1: ami-00255a59ce6bd8147 -eu-west-1: ami-00c07933e0ea22f7d -eu-west-2: ami-09aa34259643c50eb -eu-west-3: ami-04ce6f74e1070a795 -sa-east-1: ami-0a625e9dcf563db57 -us-east-1: ami-0658a809b3e89b0c9 -us-east-2: ami-07cef254f8886ea4e -us-west-1: ami-0454b933360a077e4 -us-west-2: ami-03b7e311ae2f4aacb +ap-northeast-1: ami-0784def0ccb2d5b82 +ap-northeast-2: ami-0d99fb81635d7615c +ap-northeast-3: ami-0c95572f59739c0bf +ap-south-1: ami-010b8728beb0383d6 +ap-southeast-1: ami-06ddf4654231a4d4c +ap-southeast-2: ami-0d1ab341d60525d01 +ca-central-1: ami-092fc6e2504e5f9a2 +eu-central-1: ami-003a4b7027af55f24 +eu-north-1: ami-071f5c908f62cc6cd +eu-west-1: ami-0bbd6b980d3313c64 +eu-west-2: ami-0907c15064d4edba5 +eu-west-3: ami-0125d2751f93718eb +sa-east-1: ami-07454a63f6b9e04de +us-east-1: ami-01f0260a02285b5dc +us-east-2: ami-0f9dab1d43d5744ed +us-west-1: ami-074340af697a98c37 +us-west-2: ami-0811b1738ec473064 # ubuntu1404 -ap-northeast-1: ami-0ce1c5516c087ef8d -ap-northeast-2: ami-0744c53e9582abcd4 -ap-northeast-3: ami-0d0faa548bcca5fac -ap-south-1: ami-00721e9f7f8235dba -ap-southeast-1: ami-03df9d0a89a448c63 -ap-southeast-2: ami-06116e2159f6ba6bf -ca-central-1: ami-0d180013cf3d07fc9 -cn-north-1: ami-0ef85bbc4ba66c301 -eu-central-1: ami-04b116ae9a44c861f -eu-north-1: ami-0de1c666987bbdb1f -eu-west-1: ami-01b114f6a268d6a42 -eu-west-2: ami-0f9ad3c001b80325a -eu-west-3: ami-0f921986737ab8306 -sa-east-1: ami-0d1d30ad051235185 -us-east-1: ami-0422aa8ec2e452870 -us-east-2: ami-02447e477105886bd -us-gov-east-1: ami-03538e53996b83762 -us-gov-west-1: ami-90a2d5f1 -us-west-1: ami-0f4a99f972b9b4882 -us-west-2: ami-04caeb57df33aba89 +ap-northeast-1: ami-0cd539d9e885af044 +ap-northeast-2: ami-0924984889a5d4a46 +ap-northeast-3: ami-07450b1aef436d233 +ap-south-1: ami-05f60d15408d42ac9 +ap-southeast-1: ami-0b1a46b1b9d2f4fb9 +ap-southeast-2: ami-0863af7ce35b4323f +ca-central-1: ami-09bbf1a83ca0f5ff0 +cn-north-1: ami-05f293ab117a7842d +cn-northwest-1: ami-0dc4a48b191a663d1 +eu-central-1: ami-0abac01d1d843346c +eu-north-1: ami-0bf9760df9cfca009 +eu-west-1: ami-091779c5c891c35c2 +eu-west-2: ami-07da7ca913a6ea97f +eu-west-3: ami-0427e1cae6b8e9f89 +sa-east-1: ami-0a228fdac1d498e8c +us-east-1: ami-05dbcb5308069c285 +us-east-2: ami-0a8e709dece19523d +us-gov-east-1: ami-0441f56840dcb67ca +us-gov-west-1: ami-a6d3a2c7 +us-west-1: ami-0e01f060099ea747b +us-west-2: ami-08d70cb46b697c172 # ubuntu1604 -ap-northeast-1: 
ami-041f6050eff86f024 -ap-northeast-2: ami-0df4c1dafbfee5031 -ap-northeast-3: ami-08d3ef362e1d06e56 -ap-south-1: ami-0ef148f6ae69767d7 -ap-southeast-1: ami-0b63a13236ce5b8d9 -ap-southeast-2: ami-0f5a3072f23556b07 -ca-central-1: ami-0c88262f6fd2738fc -cn-north-1: ami-017ea2a40c48f9af4 -eu-central-1: ami-06a21b6e0815065a4 -eu-north-1: ami-0418320f06192d788 -eu-west-1: ami-0809bc00666e41cfa -eu-west-2: ami-04d8578267aaa2ac4 -eu-west-3: ami-02de781189ccb9f92 -sa-east-1: ami-088d6a838e8dc6b11 -us-east-1: ami-0a8c4ea1bd1ff7651 -us-east-2: ami-04d5c390495e0509f -us-gov-east-1: ami-0bfb76fbbbb68030d -us-gov-west-1: ami-eeaed98f -us-west-1: ami-0a33d79d5f920cc2c -us-west-2: ami-00050b3048393bc12 +ap-northeast-1: ami-04b272dcdd06e1564 +ap-northeast-2: ami-0b3c32fb3a4e9c0ef +ap-northeast-3: ami-01f5ef9fbe48d7bb2 +ap-south-1: ami-0076ff722a254fdca +ap-southeast-1: ami-0a4ea1282cc83d41c +ap-southeast-2: ami-0fe13fe2f6911fa5d +ca-central-1: ami-0a7b7779aba55c024 +cn-north-1: ami-05d35ef899afade2f +cn-northwest-1: ami-0b05d343cc4707a84 +eu-central-1: ami-0ae0084f39e8dce54 +eu-north-1: ami-0383b711246b9c840 +eu-west-1: ami-091dd999747fc79d7 +eu-west-2: ami-0cd1373102c0a4d5d +eu-west-3: ami-0f3bba95cd9c832d1 +sa-east-1: ami-00416702e6c84f6c7 +us-east-1: ami-00de8aa07f24052f6 +us-east-2: ami-0d7f19ca4f88c3044 +us-gov-east-1: ami-035e990927c75e7d8 +us-gov-west-1: ami-e1d4a580 +us-west-1: ami-0f02ce7616b3016af +us-west-2: ami-0b1d55af6440b3b96 diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 2945c970b1..6d69ef2e4b 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1212,141 +1212,143 @@ "Mappings": { "AWSRegionOS2AMI": { "ap-northeast-1": { - "alinux": "ami-0af8c1a29f58c3b91", - "centos6": "ami-0476984f547d1f4f2", - "centos7": "ami-0f13f45e966236e46", - "ubuntu1404": "ami-0ce1c5516c087ef8d", - "ubuntu1604": "ami-041f6050eff86f024" + "alinux": "ami-08bbc6b440ae39278", + "centos6": "ami-0b27f204a8adab23b", + "centos7": "ami-0784def0ccb2d5b82", + "ubuntu1404": "ami-0cd539d9e885af044", + "ubuntu1604": "ami-04b272dcdd06e1564" }, "ap-northeast-2": { - "alinux": "ami-036c289fda8701f9d", - "centos6": "ami-06ecb1e81881cd450", - "centos7": "ami-016c726d8902d133c", - "ubuntu1404": "ami-0744c53e9582abcd4", - "ubuntu1604": "ami-0df4c1dafbfee5031" + "alinux": "ami-0b94cc099eb2afed1", + "centos6": "ami-010f98c99b632ece4", + "centos7": "ami-0d99fb81635d7615c", + "ubuntu1404": "ami-0924984889a5d4a46", + "ubuntu1604": "ami-0b3c32fb3a4e9c0ef" }, "ap-northeast-3": { - "alinux": "ami-000902aa3082732ce", - "centos6": "ami-04d195b55ddf56228", - "centos7": "ami-037c3a13cd142c8f8", - "ubuntu1404": "ami-0d0faa548bcca5fac", - "ubuntu1604": "ami-08d3ef362e1d06e56" + "alinux": "ami-08f450d90bf811e75", + "centos6": "ami-09dd084db04a10d89", + "centos7": "ami-0c95572f59739c0bf", + "ubuntu1404": "ami-07450b1aef436d233", + "ubuntu1604": "ami-01f5ef9fbe48d7bb2" }, "ap-south-1": { - "alinux": "ami-00ff6216daa4b0a69", - "centos6": "ami-0b1abd2bf8810487c", - "centos7": "ami-06b7212503b9d9637", - "ubuntu1404": "ami-00721e9f7f8235dba", - "ubuntu1604": "ami-0ef148f6ae69767d7" + "alinux": "ami-0991980c18e306e8e", + "centos6": "ami-0f4b9b7eb6f7c8bf4", + "centos7": "ami-010b8728beb0383d6", + "ubuntu1404": "ami-05f60d15408d42ac9", + "ubuntu1604": "ami-0076ff722a254fdca" }, "ap-southeast-1": { - "alinux": "ami-03b015a13daa9ff8d", - "centos6": "ami-0576b4b2db8272abf", - "centos7": "ami-0c39937e9ae643ecd", - "ubuntu1404": 
"ami-03df9d0a89a448c63", - "ubuntu1604": "ami-0b63a13236ce5b8d9" + "alinux": "ami-0b28cded2d076c436", + "centos6": "ami-0d756e8aa7628a1d9", + "centos7": "ami-06ddf4654231a4d4c", + "ubuntu1404": "ami-0b1a46b1b9d2f4fb9", + "ubuntu1604": "ami-0a4ea1282cc83d41c" }, "ap-southeast-2": { - "alinux": "ami-0c2528255cc7c4cec", - "centos6": "ami-09a18baab0a142123", - "centos7": "ami-0164dbfb6b7b938f5", - "ubuntu1404": "ami-06116e2159f6ba6bf", - "ubuntu1604": "ami-0f5a3072f23556b07" + "alinux": "ami-0f6bd7f7e505539e1", + "centos6": "ami-00e287b293a3b7978", + "centos7": "ami-0d1ab341d60525d01", + "ubuntu1404": "ami-0863af7ce35b4323f", + "ubuntu1604": "ami-0fe13fe2f6911fa5d" }, "ca-central-1": { - "alinux": "ami-05bad5df22b9502e5", - "centos6": "ami-0aa03a3f1b737c651", - "centos7": "ami-0ee7cb4d2673e78de", - "ubuntu1404": "ami-0d180013cf3d07fc9", - "ubuntu1604": "ami-0c88262f6fd2738fc" + "alinux": "ami-0b680d93b3d56c72a", + "centos6": "ami-0f34197ba57c32264", + "centos7": "ami-092fc6e2504e5f9a2", + "ubuntu1404": "ami-09bbf1a83ca0f5ff0", + "ubuntu1604": "ami-0a7b7779aba55c024" }, "cn-north-1": { - "alinux": "ami-0227bedfc6798cba1", - "ubuntu1404": "ami-0ef85bbc4ba66c301", - "ubuntu1604": "ami-017ea2a40c48f9af4" + "alinux": "ami-0decad57dff4bb3ea", + "ubuntu1404": "ami-05f293ab117a7842d", + "ubuntu1604": "ami-05d35ef899afade2f" }, "cn-northwest-1": { - "alinux": "ami-08143603c5f390f20" + "alinux": "ami-090c0f959ead7a813", + "ubuntu1404": "ami-0dc4a48b191a663d1", + "ubuntu1604": "ami-0b05d343cc4707a84" }, "eu-central-1": { - "alinux": "ami-003262ea853b26050", - "centos6": "ami-092bd9c46746d940b", - "centos7": "ami-0bcced571d9cc0142", - "ubuntu1404": "ami-04b116ae9a44c861f", - "ubuntu1604": "ami-06a21b6e0815065a4" + "alinux": "ami-0f188f96685b31355", + "centos6": "ami-0560468e7585063ec", + "centos7": "ami-003a4b7027af55f24", + "ubuntu1404": "ami-0abac01d1d843346c", + "ubuntu1604": "ami-0ae0084f39e8dce54" }, "eu-north-1": { - "alinux": "ami-06cac8aed0729f14c", - "centos6": "ami-07b83433077d8345b", - "centos7": "ami-00255a59ce6bd8147", - "ubuntu1404": "ami-0de1c666987bbdb1f", - "ubuntu1604": "ami-0418320f06192d788" + "alinux": "ami-08053d8c06274f531", + "centos6": "ami-0b8ee035751765dd5", + "centos7": "ami-071f5c908f62cc6cd", + "ubuntu1404": "ami-0bf9760df9cfca009", + "ubuntu1604": "ami-0383b711246b9c840" }, "eu-west-1": { - "alinux": "ami-0691d6d6d4d209e09", - "centos6": "ami-09880c7e25df69af8", - "centos7": "ami-00c07933e0ea22f7d", - "ubuntu1404": "ami-01b114f6a268d6a42", - "ubuntu1604": "ami-0809bc00666e41cfa" + "alinux": "ami-0fd242bc2abb1b146", + "centos6": "ami-07fb0fe1262b74210", + "centos7": "ami-0bbd6b980d3313c64", + "ubuntu1404": "ami-091779c5c891c35c2", + "ubuntu1604": "ami-091dd999747fc79d7" }, "eu-west-2": { - "alinux": "ami-0d241a5c57ee3421d", - "centos6": "ami-0eba961d9f30431b2", - "centos7": "ami-09aa34259643c50eb", - "ubuntu1404": "ami-0f9ad3c001b80325a", - "ubuntu1604": "ami-04d8578267aaa2ac4" + "alinux": "ami-046b84d0cba5d292f", + "centos6": "ami-00bae65e0cb9fbd31", + "centos7": "ami-0907c15064d4edba5", + "ubuntu1404": "ami-07da7ca913a6ea97f", + "ubuntu1604": "ami-0cd1373102c0a4d5d" }, "eu-west-3": { - "alinux": "ami-0e59dd1d2794a857c", - "centos6": "ami-0d0b243ac76765544", - "centos7": "ami-04ce6f74e1070a795", - "ubuntu1404": "ami-0f921986737ab8306", - "ubuntu1604": "ami-02de781189ccb9f92" + "alinux": "ami-0046b758814704b9f", + "centos6": "ami-0fa09ce7bae809182", + "centos7": "ami-0125d2751f93718eb", + "ubuntu1404": "ami-0427e1cae6b8e9f89", + "ubuntu1604": "ami-0f3bba95cd9c832d1" }, 
"sa-east-1": { - "alinux": "ami-07b044055a13cf93e", - "centos6": "ami-0dfdc6ab8bf7935ea", - "centos7": "ami-0a625e9dcf563db57", - "ubuntu1404": "ami-0d1d30ad051235185", - "ubuntu1604": "ami-088d6a838e8dc6b11" + "alinux": "ami-06cc41ea6409482e5", + "centos6": "ami-014a018823095eb14", + "centos7": "ami-07454a63f6b9e04de", + "ubuntu1404": "ami-0a228fdac1d498e8c", + "ubuntu1604": "ami-00416702e6c84f6c7" }, "us-east-1": { - "alinux": "ami-0f8b01b1377483305", - "centos6": "ami-00f71e3be938f3077", - "centos7": "ami-0658a809b3e89b0c9", - "ubuntu1404": "ami-0422aa8ec2e452870", - "ubuntu1604": "ami-0a8c4ea1bd1ff7651" + "alinux": "ami-02787c6b3550be361", + "centos6": "ami-0b0130c133fae6607", + "centos7": "ami-01f0260a02285b5dc", + "ubuntu1404": "ami-05dbcb5308069c285", + "ubuntu1604": "ami-00de8aa07f24052f6" }, "us-east-2": { - "alinux": "ami-049afa5b53a7880d8", - "centos6": "ami-0b29637d31cf774aa", - "centos7": "ami-07cef254f8886ea4e", - "ubuntu1404": "ami-02447e477105886bd", - "ubuntu1604": "ami-04d5c390495e0509f" + "alinux": "ami-0c4b18bab302f095b", + "centos6": "ami-0f4af6aff781edc2e", + "centos7": "ami-0f9dab1d43d5744ed", + "ubuntu1404": "ami-0a8e709dece19523d", + "ubuntu1604": "ami-0d7f19ca4f88c3044" }, "us-gov-east-1": { - "alinux": "ami-02ee5c66a10526bd1", - "ubuntu1404": "ami-03538e53996b83762", - "ubuntu1604": "ami-0bfb76fbbbb68030d" + "alinux": "ami-063828ccbf0ad3cfe", + "ubuntu1404": "ami-0441f56840dcb67ca", + "ubuntu1604": "ami-035e990927c75e7d8" }, "us-gov-west-1": { - "alinux": "ami-7da7d01c", - "ubuntu1404": "ami-90a2d5f1", - "ubuntu1604": "ami-eeaed98f" + "alinux": "ami-9bd7a6fa", + "ubuntu1404": "ami-a6d3a2c7", + "ubuntu1604": "ami-e1d4a580" }, "us-west-1": { - "alinux": "ami-02c87842ea944292e", - "centos6": "ami-08dc392067bcf9807", - "centos7": "ami-0454b933360a077e4", - "ubuntu1404": "ami-0f4a99f972b9b4882", - "ubuntu1604": "ami-0a33d79d5f920cc2c" + "alinux": "ami-027a3a20855e7be03", + "centos6": "ami-01fef859674ae14f1", + "centos7": "ami-074340af697a98c37", + "ubuntu1404": "ami-0e01f060099ea747b", + "ubuntu1604": "ami-0f02ce7616b3016af" }, "us-west-2": { - "alinux": "ami-09b457d5cba24514a", - "centos6": "ami-0fa309858f6ce66ee", - "centos7": "ami-03b7e311ae2f4aacb", - "ubuntu1404": "ami-04caeb57df33aba89", - "ubuntu1604": "ami-00050b3048393bc12" + "alinux": "ami-00b4f23ba0a57a141", + "centos6": "ami-0c0a6c53fc044e440", + "centos7": "ami-0811b1738ec473064", + "ubuntu1404": "ami-08d70cb46b697c172", + "ubuntu1604": "ami-0b1d55af6440b3b96" } }, "OSFeatures": { From 1fe8d2024ab61950f44e454175ef22a1f57b5ac9 Mon Sep 17 00:00:00 2001 From: cfncluster-ami-bot <39313383+cfncluster-ami-bot@users.noreply.github.com> Date: Tue, 16 Apr 2019 18:14:05 +0200 Subject: [PATCH 027/121] Remove Ubuntu from cn-northwest-1 --- amis.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/amis.txt b/amis.txt index fbd9f2559b..ba448a7542 100644 --- a/amis.txt +++ b/amis.txt @@ -65,7 +65,6 @@ ap-southeast-1: ami-0b1a46b1b9d2f4fb9 ap-southeast-2: ami-0863af7ce35b4323f ca-central-1: ami-09bbf1a83ca0f5ff0 cn-north-1: ami-05f293ab117a7842d -cn-northwest-1: ami-0dc4a48b191a663d1 eu-central-1: ami-0abac01d1d843346c eu-north-1: ami-0bf9760df9cfca009 eu-west-1: ami-091779c5c891c35c2 @@ -87,7 +86,6 @@ ap-southeast-1: ami-0a4ea1282cc83d41c ap-southeast-2: ami-0fe13fe2f6911fa5d ca-central-1: ami-0a7b7779aba55c024 cn-north-1: ami-05d35ef899afade2f -cn-northwest-1: ami-0b05d343cc4707a84 eu-central-1: ami-0ae0084f39e8dce54 eu-north-1: ami-0383b711246b9c840 eu-west-1: ami-091dd999747fc79d7 From 
7d0af798767c87d2515b6ea933de247424231bd5 Mon Sep 17 00:00:00 2001 From: cfncluster-ami-bot <39313383+cfncluster-ami-bot@users.noreply.github.com> Date: Tue, 16 Apr 2019 18:14:39 +0200 Subject: [PATCH 028/121] Remove Ubuntu from cn-northwest-1 --- cloudformation/aws-parallelcluster.cfn.json | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 6d69ef2e4b..4a75856a80 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1266,9 +1266,7 @@ "ubuntu1604": "ami-05d35ef899afade2f" }, "cn-northwest-1": { - "alinux": "ami-090c0f959ead7a813", - "ubuntu1404": "ami-0dc4a48b191a663d1", - "ubuntu1604": "ami-0b05d343cc4707a84" + "alinux": "ami-090c0f959ead7a813" }, "eu-central-1": { "alinux": "ami-0f188f96685b31355", From 91e3d717b00ecbc2085283bb472cef959f762ba0 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 16 Apr 2019 17:39:30 +0200 Subject: [PATCH 029/121] integ tests: add retry for vpc stack failures This mitigates failures due to resources not available in randomly picked AZs. Signed-off-by: Francesco De Martino --- tests/integration-tests/cfn_stacks_factory.py | 2 +- tests/integration-tests/conftest.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/integration-tests/cfn_stacks_factory.py b/tests/integration-tests/cfn_stacks_factory.py index 6eec8ac3e8..25d8fb40c2 100644 --- a/tests/integration-tests/cfn_stacks_factory.py +++ b/tests/integration-tests/cfn_stacks_factory.py @@ -47,8 +47,8 @@ def __init__(self): def create_stack(self, stack): """ Create a cfn stack with a given template. + :param stack: stack to create. - :return: """ name = stack.name region = stack.region diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 9ecf62097d..a3ed6f68ba 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -22,6 +22,7 @@ import configparser import pytest +from retrying import retry from cfn_stacks_factory import CfnStack, CfnStacksFactory from clusters_factory import Cluster, ClustersFactory @@ -311,13 +312,20 @@ def vpc_stacks(cfn_stacks_factory, request): ) vpc_config = VPCConfig(subnets=[public_subnet, private_subnet]) template = VPCTemplateBuilder(vpc_config).build() - stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) - cfn_stacks_factory.create_stack(stack) - vpc_stacks[region] = stack + vpc_stacks[region] = _create_vpc_stack(template, region) return vpc_stacks +# If stack creation fails it'll retry once more. This is done to mitigate failures due to resources +# not available in randomly picked AZs. 
+@retry(stop_max_attempt_number=2, wait_fixed=5000) +def _create_vpc_stack(template, region): + stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) + cfn_stacks_factory.create_stack(stack) + return stack + + @pytest.fixture(scope="function") def s3_bucket_factory(region): """ From 2e1783fe5ff7dcc87e23bfa4ce4cc1d36f6d314c Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 16 Apr 2019 18:31:08 +0200 Subject: [PATCH 030/121] integ tests: skip encrypted ebs tests in china since KMS not available Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/storage/test_ebs.py | 5 +++-- .../storage/test_ebs/test_ebs_single/pcluster.config.ini | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index 588f69fab9..a71252cd0d 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -35,7 +35,8 @@ def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory): _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) -@pytest.mark.regions(["us-east-1", "cn-north-1", "us-gov-east-1"]) +# cn-north-1 does not support KMS +@pytest.mark.regions(["us-east-1", "us-gov-east-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") @@ -53,7 +54,7 @@ def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory): @pytest.mark.regions(["eu-west-2", "cn-northwest-1", "us-gov-west-1"]) -@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.instances(["c4.xlarge", "c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") def test_default_ebs(scheduler, pcluster_config_reader, clusters_factory): diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini index 70c62e03f4..995998e018 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini @@ -29,4 +29,3 @@ compute_subnet_id = {{ private_subnet_id }} shared_dir = {{ mount_dir }} volume_type = io1 volume_iops = 210 -encrypted = true From 94733413c59fb93ca430545c99775d0e8d3fa765 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 16 Apr 2019 18:33:04 +0200 Subject: [PATCH 031/121] integ tests: skipping ubuntu tests in cn-northwest-1 Due to aws-cfn-bootstrap missing in Ningxia region Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/test_scaling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index 24e4d2d218..d3d54504c6 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -21,6 +21,8 @@ @pytest.mark.skip_schedulers(["awsbatch"]) +@pytest.mark.skip_dimensions("cn-northwest-1", "*", "ubuntu1404", "*") # aws-cfn-bootstrap missing in Ningxia region +@pytest.mark.skip_dimensions("cn-northwest-1", "*", "ubuntu1604", "*") # aws-cfn-bootstrap missing in Ningxia region @pytest.mark.usefixtures("region", "os", "instance") def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, 
clusters_factory, test_datadir):
     scaledown_idletime = 4

From 52b426415a19447874443aca39b118d87acc3a7d Mon Sep 17 00:00:00 2001
From: Francesco De Martino
Date: Tue, 16 Apr 2019 18:36:32 +0200
Subject: [PATCH 032/121] integ tests: disabling tests for centos7-torque until bug is solved

Issue https://github.com/aws/aws-parallelcluster/issues/875

Signed-off-by: Francesco De Martino
---
 tests/integration-tests/tests/test_scaling.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py
index d3d54504c6..568663460d 100644
--- a/tests/integration-tests/tests/test_scaling.py
+++ b/tests/integration-tests/tests/test_scaling.py
@@ -23,6 +23,7 @@
 @pytest.mark.skip_schedulers(["awsbatch"])
 @pytest.mark.skip_dimensions("cn-northwest-1", "*", "ubuntu1404", "*")  # aws-cfn-bootstrap missing in Ningxia region
 @pytest.mark.skip_dimensions("cn-northwest-1", "*", "ubuntu1604", "*")  # aws-cfn-bootstrap missing in Ningxia region
+@pytest.mark.skip_dimensions("*", "*", "centos7", "torque")  # https://github.com/aws/aws-parallelcluster/issues/875
 @pytest.mark.usefixtures("region", "os", "instance")
 def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
     scaledown_idletime = 4

From be8bde1532f654b06e222b376cf8c5a4b2f7fb5c Mon Sep 17 00:00:00 2001
From: Enrico Usai
Date: Wed, 17 Apr 2019 16:22:33 +0200
Subject: [PATCH 033/121] Integ tests: test s3_read_(write_)?_resource update

We are verifying that a customer is able to update the s3_read_resource
and s3_read_write_resource configuration parameters.

Signed-off-by: Enrico Usai
---
 .../tests/update/test_update.py               | 49 +++++++++++++++++--
 .../test_update/pcluster.config.ini           |  2 +
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py
index 7478143e9c..85692fec50 100644
--- a/tests/integration-tests/tests/update/test_update.py
+++ b/tests/integration-tests/tests/update/test_update.py
@@ -20,7 +20,9 @@
 from tests.common.schedulers_common import SlurmCommands
 from time_utils import minutes

-PClusterConfig = namedtuple("PClusterConfig", ["max_queue_size", "compute_instance"])
+PClusterConfig = namedtuple(
+    "PClusterConfig", ["max_queue_size", "compute_instance", "s3_read_resource", "s3_read_write_resource"]
+)


 @pytest.mark.dimensions("eu-west-1", "c5.xlarge", "alinux", "slurm")
 @pytest.mark.usefixtures("os", "scheduler")
 def test_update(instance, region, pcluster_config_reader, clusters_factory):
     """
     Test 'pcluster update' command.

     Grouped all tests in a single function so that cluster can be reused for all of them.
""" - init_config = PClusterConfig(max_queue_size=5, compute_instance=instance) + s3_arn = "arn:aws:s3:::fake_bucket/*" + init_config = PClusterConfig( + max_queue_size=5, compute_instance=instance, s3_read_resource=s3_arn, s3_read_write_resource=s3_arn + ) cluster = _init_cluster(region, clusters_factory, pcluster_config_reader, init_config) - updated_config = PClusterConfig(max_queue_size=10, compute_instance="c4.xlarge") + s3_arn_updated = "arn:aws:s3:::fake_bucket/fake_folder/*" + updated_config = PClusterConfig( + max_queue_size=10, + compute_instance="c4.xlarge", + s3_read_resource=s3_arn_updated, + s3_read_write_resource=s3_arn_updated, + ) _update_cluster(cluster, updated_config) # test update _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size) _test_update_compute_instance_type(region, cluster, updated_config.compute_instance) + _test_s3_read_resource(region, cluster, updated_config.s3_read_resource) + _test_s3_read_write_resource(region, cluster, updated_config.s3_read_write_resource) def _init_cluster(region, clusters_factory, pcluster_config_reader, config): # read configuration and create cluster cluster_config = pcluster_config_reader( - max_queue_size=config.max_queue_size, compute_instance=config.compute_instance + max_queue_size=config.max_queue_size, + compute_instance=config.compute_instance, + s3_read_resource=config.s3_read_resource, + s3_read_write_resource=config.s3_read_write_resource, ) cluster = clusters_factory(cluster_config) # Verify initial settings _test_max_queue(region, cluster.cfn_name, config.max_queue_size) _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance) + _test_s3_read_resource(region, cluster, config.s3_read_resource) + _test_s3_read_write_resource(region, cluster, config.s3_read_write_resource) return cluster @@ -60,6 +78,8 @@ def _update_cluster(cluster, config): # change cluster.config settings _update_cluster_property(cluster, "max_queue_size", str(config.max_queue_size)) _update_cluster_property(cluster, "compute_instance_type", config.compute_instance) + _update_cluster_property(cluster, "s3_read_resource", config.s3_read_resource) + _update_cluster_property(cluster, "s3_read_write_resource", config.s3_read_write_resource) # rewrite configuration file starting from the updated cluster.config object with open(cluster.config_file, "w") as configfile: cluster.config.write(configfile) @@ -100,3 +120,24 @@ def _test_compute_instance_type(region, stack_name, compute_instance_type): instance_types.append(instance.instance_type) assert_that(instance_types).contains(compute_instance_type) + + +def _test_policy_statement(region, cluster, policy_name, policy_statement): + iam_client = boto3.client("iam", region_name=region) + root_role = cluster.cfn_resources.get("RootRole") + + statement = ( + iam_client.get_role_policy(RoleName=root_role, PolicyName=policy_name) + .get("PolicyDocument") + .get("Statement")[0] + .get("Resource")[0] + ) + assert_that(statement).is_equal_to(policy_statement) + + +def _test_s3_read_resource(region, cluster, s3_arn): + _test_policy_statement(region, cluster, "S3Read", s3_arn) + + +def _test_s3_read_write_resource(region, cluster, s3_arn): + _test_policy_statement(region, cluster, "S3ReadWrite", s3_arn) diff --git a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini index 728eceab23..64c827c385 100644 --- 
a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini +++ b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini @@ -15,6 +15,8 @@ initial_queue_size = 1 max_queue_size = {{ max_queue_size }} maintain_initial_size = true scaling_settings = custom +s3_read_resource = {{ s3_read_resource }} +s3_read_write_resource = {{ s3_read_write_resource }} [scaling custom] scaledown_idletime = 3 From 31f429cc6632eb282191748168ec17a0c0a8e7a5 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Wed, 17 Apr 2019 16:23:07 +0200 Subject: [PATCH 034/121] Integ tests: minor internal rename It is a boto3 resource not a boto3 client. Signed-off-by: Enrico Usai --- tests/integration-tests/tests/update/test_update.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index 85692fec50..36d5a7c2a3 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -114,9 +114,9 @@ def _test_update_compute_instance_type(region, cluster, new_compute_instance): def _test_compute_instance_type(region, stack_name, compute_instance_type): - ec2_client = boto3.resource("ec2", region_name=region) + ec2_resource = boto3.resource("ec2", region_name=region) instance_types = [] - for instance in ec2_client.instances.filter(Filters=[{"Name": "tag:Application", "Values": [stack_name]}]): + for instance in ec2_resource.instances.filter(Filters=[{"Name": "tag:Application", "Values": [stack_name]}]): instance_types.append(instance.instance_type) assert_that(instance_types).contains(compute_instance_type) From ef0b1064a91eacefa1cdf4426a71b254b219fcbb Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Wed, 17 Apr 2019 12:56:20 -0700 Subject: [PATCH 035/121] Add missing parameter --- tests/integration-tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index a3ed6f68ba..3c16fbd50c 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -312,7 +312,7 @@ def vpc_stacks(cfn_stacks_factory, request): ) vpc_config = VPCConfig(subnets=[public_subnet, private_subnet]) template = VPCTemplateBuilder(vpc_config).build() - vpc_stacks[region] = _create_vpc_stack(template, region) + vpc_stacks[region] = _create_vpc_stack(template, region, cfn_stacks_factory) return vpc_stacks @@ -320,7 +320,7 @@ def vpc_stacks(cfn_stacks_factory, request): # If stack creation fails it'll retry once more. This is done to mitigate failures due to resources # not available in randomly picked AZs. 
@retry(stop_max_attempt_number=2, wait_fixed=5000) -def _create_vpc_stack(template, region): +def _create_vpc_stack(template, region, cfn_stacks_factory): stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) cfn_stacks_factory.create_stack(stack) return stack From 4c2ac3e7941301ec7e481ebf7d0b7854b8c445f0 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Tue, 16 Apr 2019 16:25:22 -0700 Subject: [PATCH 036/121] Use LaunchTemplates in the GovCloud region Signed-off-by: Sean Smith --- cloudformation/aws-parallelcluster.cfn.json | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 4a75856a80..2c4c92c8b8 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1150,14 +1150,7 @@ "CreateLaunchConfig": { "Fn::And": [ { - "Fn::Or": [ - { - "Condition": "ChinaRegion" - }, - { - "Condition": "GovCloudRegion" - } - ] + "Condition": "ChinaRegion" }, { "Condition": "CreateComputeFleet" From 6df2e627a9a14a0133535afb57dd170d18bd1852 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Tue, 16 Apr 2019 17:08:37 -0700 Subject: [PATCH 037/121] Remove Launch Configurations * Move to Launch Configurations everywhere Signed-off-by: Sean Smith --- cloudformation/aws-parallelcluster.cfn.json | 714 +------------------- 1 file changed, 9 insertions(+), 705 deletions(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 2c4c92c8b8..89adc24458 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1118,45 +1118,6 @@ "aws-us-gov" ] }, - "ChinaRegion": { - "Fn::Equals": [ - { - "Ref": "AWS::Partition" - }, - "aws-cn" - ] - }, - "CreateLaunchTemplate": { - "Fn::And": [ - { - "Fn::Not": [ - { - "Fn::Or": [ - { - "Condition": "ChinaRegion" - }, - { - "Condition": "GovCloudRegion" - } - ] - } - ] - }, - { - "Condition": "CreateComputeFleet" - } - ] - }, - "CreateLaunchConfig": { - "Fn::And": [ - { - "Condition": "ChinaRegion" - }, - { - "Condition": "CreateComputeFleet" - } - ] - }, "CreateComputeFleet": { "Fn::Not": [ { @@ -2623,35 +2584,16 @@ ] } ], - "LaunchConfigurationName": { - "Fn::If": [ - "CreateLaunchConfig", - { - "Ref": "ComputeServerLaunchConfig" - }, - { - "Ref": "AWS::NoValue" - } - ] - }, "LaunchTemplate": { - "Fn::If": [ - "CreateLaunchTemplate", - { - "LaunchTemplateId": { - "Ref": "ComputeServerLaunchTemplate" - }, - "Version": { - "Fn::GetAtt": [ - "ComputeServerLaunchTemplate", - "LatestVersionNumber" - ] - } - }, - { - "Ref": "AWS::NoValue" - } - ] + "LaunchTemplateId": { + "Ref": "ComputeServerLaunchTemplate" + }, + "Version": { + "Fn::GetAtt": [ + "ComputeServerLaunchTemplate", + "LatestVersionNumber" + ] + } }, "MinSize": { "Ref": "MinSize" @@ -2757,643 +2699,6 @@ }, "Condition": "CreateComputeFleet" }, - "ComputeServerLaunchConfig": { - "Type": "AWS::AutoScaling::LaunchConfiguration", - "Condition": "CreateLaunchConfig", - "Properties": { - "SecurityGroups": [ - { - "Fn::If": [ - "CreateSecurityGroups", - { - "Ref": "ComputeSecurityGroup" - }, - { - "Ref": "AWS::NoValue" - } - ] - }, - { - "Fn::If": [ - "AddAdditionalSG", - { - "Ref": "AdditionalSG" - }, - { - "Ref": "AWS::NoValue" - } - ] - }, - { - "Fn::If": [ - "UseExistingSecurityGroup", - { - "Ref": "VPCSecurityGroupId" - }, - { - "Ref": "AWS::NoValue" - } - ] - } - ], - "AssociatePublicIpAddress": { - "Fn::If": [ 
- "ComputePublicIps", - true, - false - ] - }, - "InstanceType": { - "Ref": "ComputeInstanceType" - }, - "KeyName": { - "Ref": "KeyName" - }, - "IamInstanceProfile": { - "Ref": "RootInstanceProfile" - }, - "SpotPrice": { - "Fn::If": [ - "UseSpotInstances", - { - "Ref": "SpotPrice" - }, - { - "Ref": "AWS::NoValue" - } - ] - }, - "ImageId": { - "Fn::If": [ - "UseCustomAMI", - { - "Ref": "CustomAMI" - }, - { - "Fn::FindInMap": [ - "AWSRegionOS2AMI", - { - "Ref": "AWS::Region" - }, - { - "Ref": "BaseOS" - } - ] - } - ] - }, - "InstanceMonitoring": false, - "BlockDeviceMappings": [ - { - "DeviceName": "/dev/xvdba", - "VirtualName": "ephemeral0" - }, - { - "DeviceName": "/dev/xvdbb", - "VirtualName": "ephemeral1" - }, - { - "DeviceName": "/dev/xvdbc", - "VirtualName": "ephemeral2" - }, - { - "DeviceName": "/dev/xvdbd", - "VirtualName": "ephemeral3" - }, - { - "DeviceName": "/dev/xvdbe", - "VirtualName": "ephemeral4" - }, - { - "DeviceName": "/dev/xvdbf", - "VirtualName": "ephemeral5" - }, - { - "DeviceName": "/dev/xvdbg", - "VirtualName": "ephemeral6" - }, - { - "DeviceName": "/dev/xvdbh", - "VirtualName": "ephemeral7" - }, - { - "DeviceName": "/dev/xvdbi", - "VirtualName": "ephemeral8" - }, - { - "DeviceName": "/dev/xvdbj", - "VirtualName": "ephemeral9" - }, - { - "DeviceName": "/dev/xvdbk", - "VirtualName": "ephemeral10" - }, - { - "DeviceName": "/dev/xvdbl", - "VirtualName": "ephemeral11" - }, - { - "DeviceName": "/dev/xvdbm", - "VirtualName": "ephemeral12" - }, - { - "DeviceName": "/dev/xvdbn", - "VirtualName": "ephemeral13" - }, - { - "DeviceName": "/dev/xvdbo", - "VirtualName": "ephemeral14" - }, - { - "DeviceName": "/dev/xvdbp", - "VirtualName": "ephemeral15" - }, - { - "DeviceName": "/dev/xvdbq", - "VirtualName": "ephemeral16" - }, - { - "DeviceName": "/dev/xvdbr", - "VirtualName": "ephemeral17" - }, - { - "DeviceName": "/dev/xvdbs", - "VirtualName": "ephemeral18" - }, - { - "DeviceName": "/dev/xvdbt", - "VirtualName": "ephemeral19" - }, - { - "DeviceName": "/dev/xvdbu", - "VirtualName": "ephemeral20" - }, - { - "DeviceName": "/dev/xvdbv", - "VirtualName": "ephemeral21" - }, - { - "DeviceName": "/dev/xvdbw", - "VirtualName": "ephemeral22" - }, - { - "DeviceName": "/dev/xvdbx", - "VirtualName": "ephemeral23" - }, - { - "DeviceName": { - "Fn::FindInMap": [ - "OSFeatures", - { - "Ref": "BaseOS" - }, - "RootDevice" - ] - }, - "Ebs": { - "VolumeSize": { - "Ref": "ComputeRootVolumeSize" - }, - "VolumeType": "gp2" - } - } - ], - "PlacementTenancy": { - "Fn::If": [ - "UseDedicatedTenancy", - { - "Ref": "Tenancy" - }, - { - "Ref": "AWS::NoValue" - } - ] - }, - "UserData": { - "Fn::Base64": { - "Fn::Join": [ - "", - [ - "Content-Type: multipart/mixed; boundary=\"==BOUNDARY==\"\n", - "MIME-Version: 1.0\n\n", - "--==BOUNDARY==\n", - "Content-Type: text/cloud-config; charset=\"us-ascii\"\n", - "MIME-Version: 1.0\n\n", - "#cloud-config:\n", - "runcmd:\n", - " - [ sh, -c, 'which yum && echo \"proxy=", - { - "Fn::If": [ - "UseProxy", - { - "Ref": "ProxyServer" - }, - "_none_" - ] - }, - "\" >> /etc/yum.conf || echo \"Not yum system\"' ]\n", - " - [ sh, -c, 'which apt-get && echo \"Acquire::http::Proxy \\\"", - { - "Fn::If": [ - "UseProxy", - { - "Ref": "ProxyServer" - }, - "false" - ] - }, - "\\\";\" >> /etc/apt/apt.conf || echo \"Not apt system\"' ]\n", - "--==BOUNDARY==\n", - "Content-Type: text/x-shellscript; charset=\"us-ascii\"\n", - "MIME-Version: 1.0\n\n", - "#!/bin/bash -x\n\n", - "function error_exit\n", - "{\n", - " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", - { - "Ref": 
"AWS::StackName" - }, - " --resource=ComputeFleet --region=", - { - "Ref": "AWS::Region" - }, - "\n", - " exit 1\n", - "}\n", - "function vendor_cookbook\n", - "{\n", - " mkdir /tmp/cookbooks\n", - " cd /tmp/cookbooks\n", - " tar -xzf /etc/chef/aws-parallelcluster-cookbook.tgz\n", - " HOME_BAK=${HOME}\n", - " export HOME=\"/tmp\"\n", - " . /tmp/proxy.sh; for d in `ls /tmp/cookbooks`; do cd /tmp/cookbooks/$d;LANG=en_US.UTF-8 /opt/chef/embedded/bin/berks vendor /etc/chef/cookbooks --delete; done;\n", - " export HOME=${HOME_BAK}\n", - "}\n", - "function bootstrap_instance\n", - "{\n", - " which yum 2>/dev/null; yum=$?\n", - " which apt-get 2>/dev/null; apt=$?\n", - " if [ \"${yum}\" == \"0\" ]; then\n", - " yum -y groupinstall development && yum -y install curl wget jq\n", - " fi\n", - " if [ \"${apt}\" == \"0\" ]; then\n", - " apt-cache search build-essential; apt-get clean; apt-get update; apt-get -y install build-essential curl wget jq\n", - " fi\n", - " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn\"", - " which cfn-init 2>/dev/null || ( curl -s -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://s3.${s3_url}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz; easy_install -U /tmp/aws-cfn-bootstrap-latest.tar.gz)\n", - " mkdir -p /etc/chef && chown -R root:root /etc/chef\n", - " curl --retry 3 -L https://www.chef.io/chef/install.sh | bash -s -- -v ${chef_version}\n", - " /opt/chef/embedded/bin/gem install --no-rdoc --no-ri ridley:${ridley_version} berkshelf:${berkshelf_version}\n", - " curl --retry 3 -s -L -o /etc/chef/aws-parallelcluster-cookbook.tgz ${cookbook_url}\n", - " curl --retry 3 -s -L -o /etc/chef/aws-parallelcluster-cookbook.tgz.date ${cookbook_url}.date\n", - " curl --retry 3 -s -L -o /etc/chef/aws-parallelcluster-cookbook.tgz.md5 ${cookbook_url}.md5\n", - " vendor_cookbook\n", - " mkdir /opt/parallelcluster && echo ${parallelcluster_version} | tee /opt/parallelcluster/.bootstrapped\n", - "}\n", - "proxy=", - { - "Ref": "ProxyServer" - }, - "\n", - "custom_cookbook=", - { - "Ref": "CustomChefCookbook" - }, - "\n", - "if [ \"${proxy}\" != \"NONE\" ]; then\n", - " proxy_args=\"--http-proxy=${proxy} --https-proxy=${proxy}\"\n", - " proxy_host=$(echo \"${proxy}\" | awk -F/ '{print $3}' | cut -d: -f1)\n", - " proxy_port=$(echo \"${proxy}\" | awk -F/ '{print $3}' | cut -d: -f2)\n", - " export http_proxy=${proxy}; export https_proxy=${http_proxy}\n", - " export HTTP_PROXY=${proxy}; export HTTPS_PROXY=${http_proxy}\n", - " export no_proxy=169.254.169.254; export NO_PROXY=169.254.169.254\n", - " echo -e \"export http_proxy=${proxy}; export https_proxy=${http_proxy}\nexport HTTP_PROXY=${proxy}; export HTTPS_PROXY=${http_proxy}\nexport no_proxy=169.254.169.254; export NO_PROXY=169.254.169.254\n\" >/tmp/proxy.sh\n", - " echo -e \"[Boto]\nproxy = ${proxy_host}\nproxy_port = ${proxy_port}\n\" >/etc/boto.cfg\n", - "else\n", - " proxy_args=\"\"\n", - " touch /tmp/proxy.sh\n", - "fi\n", - " _region=", - { - "Ref": "AWS::Region" - }, - "\n", - "s3_url=", - { - "Fn::FindInMap": [ - "Partition2Url", - { - "Ref": "AWS::Partition" - }, - "url" - ] - }, - "\n", - "if [ \"${custom_cookbook}\" != \"NONE\" ]; then\n", - " cookbook_url=${custom_cookbook}\n", - "else\n", - " cookbook_url=https://s3.${_region}.${s3_url}/${_region}-aws-parallelcluster/cookbooks/", - { - "Fn::FindInMap": [ - "PackagesVersions", - "default", - "cookbook" - ] - }, - ".tgz\n", - "fi\n", - "export PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin\n", - "export 
parallelcluster_version=aws-parallelcluster-", - { - "Fn::FindInMap": [ - "PackagesVersions", - "default", - "parallelcluster" - ] - }, - "\n", - "export cookbook_version=", - { - "Fn::FindInMap": [ - "PackagesVersions", - "default", - "cookbook" - ] - }, - "\n", - "export chef_version=", - { - "Fn::FindInMap": [ - "PackagesVersions", - "default", - "chef" - ] - }, - "\n", - "export ridley_version=", - { - "Fn::FindInMap": [ - "PackagesVersions", - "default", - "ridley" - ] - }, - "\n", - "export berkshelf_version=", - { - "Fn::FindInMap": [ - "PackagesVersions", - "default", - "berkshelf" - ] - }, - "\n", - "if [ -f /opt/parallelcluster/.bootstrapped ]; then\n", - " installed_version=$(cat /opt/parallelcluster/.bootstrapped)\n", - " if [ \"${parallelcluster_version}\" != \"${installed_version}\" ]; then\n", - " bootstrap_instance\n", - " fi\n", - "else\n", - " bootstrap_instance\n", - "fi\n", - "if [ \"${custom_cookbook}\" != \"NONE\" ]; then\n", - " curl --retry 3 -v -L -o /etc/chef/aws-parallelcluster-cookbook.tgz -z \"$(cat /etc/chef/aws-parallelcluster-cookbook.tgz.date)\" ${cookbook_url}\n", - " vendor_cookbook\n", - "fi\n", - "cd /tmp\n", - "# Call CloudFormation\n", - "cfn-init ${proxy_args} -s ", - { - "Ref": "AWS::StackName" - }, - " -v -c default -r ComputeServerLaunchConfig --region ", - { - "Ref": "AWS::Region" - }, - " || error_exit 'Failed to run cfn-init. If --norollback was specified, check /var/log/cfn-init.log and /var/log/cloud-init-output.log.'\n", - "cfn-signal ${proxy_args} --exit-code=0 --reason=\"MasterServer setup complete\" --stack=", - { - "Ref": "AWS::StackName" - }, - " --resource=ComputeFleet --region=", - { - "Ref": "AWS::Region" - }, - "\n", - "# End of file\n", - "--==BOUNDARY==\n" - ] - ] - } - } - }, - "Metadata": { - "Comment": "AWS ParallelCluster Compute server", - "AWS::CloudFormation::Init": { - "configSets": { - "default": [ - "deployConfigFiles", - "chefPrepEnv", - "shellRunPreInstall", - "chefConfig", - "shellRunPostInstall", - "signalComputeReady" - ] - }, - "deployConfigFiles": { - "files": { - "/tmp/dna.json": { - "mode": "000644", - "owner": "root", - "group": "root", - "content": { - "cfncluster": { - "stack_name": { - "Ref": "AWS::StackName" - }, - "cfn_raid_parameters": { - "Ref": "RAIDOptions" - }, - "cfn_preinstall": { - "Ref": "PreInstallScript" - }, - "cfn_preinstall_args": { - "Ref": "PreInstallArgs" - }, - "cfn_postinstall": { - "Ref": "PostInstallScript" - }, - "cfn_postinstall_args": { - "Ref": "PostInstallArgs" - }, - "cfn_region": { - "Ref": "AWS::Region" - }, - "cfn_efs": { - "Fn::If": [ - "CreateEFSSubstack", - { - "Fn::GetAtt": [ - "EFSSubstack", - "Outputs.FileSystemId" - ] - }, - "" - ] - }, - "cfn_efs_shared_dir": { - "Ref": "EFSOptions" - }, - "cfn_fsx_fs_id": { - "Fn::If": [ - "CreateFSXSubstack", - { - "Fn::GetAtt": [ - "FSXSubstack", - "Outputs.FileSystemId" - ] - }, - "" - ] - }, - "cfn_fsx_options": { - "Ref": "FSXOptions" - }, - "cfn_scheduler": { - "Ref": "Scheduler" - }, - "cfn_scaledown_idletime": { - "Ref": "ScaleDownIdleTime" - }, - "cfn_encrypted_ephemeral": { - "Ref": "EncryptedEphemeral" - }, - "cfn_ephemeral_dir": { - "Ref": "EphemeralDir" - }, - "cfn_shared_dir": { - "Ref": "SharedDir" - }, - "cfn_proxy": { - "Ref": "ProxyServer" - }, - "cfn_sqs_queue": { - "Ref": "SQS" - }, - "cfn_master": { - "Fn::GetAtt": [ - "MasterServer", - "PrivateDnsName" - ] - }, - "cfn_node_type": "ComputeFleet", - "cfn_cluster_user": { - "Fn::FindInMap": [ - "OSFeatures", - { - "Ref": "BaseOS" - }, - "User" - ] - } - }, - 
"run_list": { - "Fn::If": [ - "UseCustomRunList", - { - "Ref": "CustomChefRunList" - }, - { - "Fn::Join": [ - "", - [ - "recipe[aws-parallelcluster::", - { - "Ref": "Scheduler" - }, - "_config]" - ] - ] - } - ] - } - } - }, - "/etc/chef/client.rb": { - "mode": "000644", - "owner": "root", - "group": "root", - "content": { - "Fn::Join": [ - "", - [ - "cookbook_path ['/etc/chef/cookbooks']" - ] - ] - } - }, - "/tmp/extra.json": { - "mode": "000644", - "owner": "root", - "group": "root", - "content": { - "Ref": "ExtraJson" - } - } - }, - "commands": { - "mkdir": { - "command": "mkdir -p /etc/chef/ohai/hints" - }, - "touch": { - "command": "touch /etc/chef/ohai/hints/ec2.json" - }, - "jq": { - "command": "jq --argfile f1 /tmp/dna.json --argfile f2 /tmp/extra.json -n '$f1 + $f2 | .cfncluster = $f1.cfncluster + $f2.cfncluster' > /etc/chef/dna.json || ( echo \"jq not installed\"; cp /tmp/dna.json /etc/chef/dna.json )" - } - } - }, - "chefPrepEnv": { - "commands": { - "chef": { - "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster::_prep_env", - "cwd": "/etc/chef" - } - } - }, - "shellRunPreInstall": { - "commands": { - "runpreinstall": { - "command": "/opt/parallelcluster/scripts/fetch_and_run -preinstall" - } - } - }, - "chefConfig": { - "commands": { - "chef": { - "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json", - "cwd": "/etc/chef" - } - } - }, - "shellRunPostInstall": { - "commands": { - "runpostinstall": { - "command": "/opt/parallelcluster/scripts/fetch_and_run -postinstall" - } - } - }, - "signalComputeReady": { - "commands": { - "compute_ready": { - "command": "/opt/parallelcluster/scripts/compute_ready" - } - } - } - } - } - }, "ComputeServerLaunchTemplate": { "Type": "AWS::EC2::LaunchTemplate", "Properties": { @@ -3852,7 +3157,6 @@ } } }, - "Condition": "CreateLaunchTemplate", "Metadata": { "Comment": "AWS ParallelCluster Compute server", "AWS::CloudFormation::Init": { From 443edd665e5bf9c16d04bf168c87283259039fd2 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 19 Apr 2019 09:55:43 +0200 Subject: [PATCH 038/121] Integ tests: slurm - update dummy-node CPUs value in assertion Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/schedulers/test_slurm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index cd8f7322a7..5c3433241d 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -152,7 +152,11 @@ def _retrieve_slurm_dummy_nodes(remote_command_executor): def _assert_dummy_nodes(remote_command_executor, count): __tracebackhide__ = True dummy_nodes_config = _retrieve_slurm_dummy_nodes_from_config(remote_command_executor) - assert_that(dummy_nodes_config).is_equal_to("NodeName=dummy-compute[1-{0}] CPUs=2048 State=FUTURE".format(count)) + # For the moment the test is enabled only on c5.xlarge, hence hardcoding slots for simplicity + slots = 4 + assert_that(dummy_nodes_config).is_equal_to( + "NodeName=dummy-compute[1-{0}] CPUs={1} State=FUTURE".format(count, slots) + ) dummy_nodes_count = _retrieve_slurm_dummy_nodes(remote_command_executor) 
assert_that(dummy_nodes_count).is_equal_to(count) From 6db03dd3be972b9ae9a77dc36f4ce9658e7d2e09 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 18 Apr 2019 20:17:55 -0700 Subject: [PATCH 039/121] Remove IAM LaunchConfig Permissions Signed-off-by: Sean Smith --- docs/iam.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/iam.rst b/docs/iam.rst index 40787131e3..7eb1a615c5 100644 --- a/docs/iam.rst +++ b/docs/iam.rst @@ -276,7 +276,6 @@ To: :: "Sid": "AutoScalingDescribe", "Action": [ "autoscaling:DescribeAutoScalingGroups", - "autoscaling:DescribeLaunchConfigurations", "autoscaling:DescribeAutoScalingInstances" ], "Effect": "Allow", @@ -286,7 +285,6 @@ To: :: "Sid": "AutoScalingModify", "Action": [ "autoscaling:CreateAutoScalingGroup", - "autoscaling:CreateLaunchConfiguration", "ec2:CreateLaunchTemplate", "ec2:ModifyLaunchTemplate", "ec2:DeleteLaunchTemplate", @@ -295,7 +293,6 @@ To: :: "autoscaling:PutNotificationConfiguration", "autoscaling:UpdateAutoScalingGroup", "autoscaling:PutScalingPolicy", - "autoscaling:DeleteLaunchConfiguration", "autoscaling:DescribeScalingActivities", "autoscaling:DeleteAutoScalingGroup", "autoscaling:DeletePolicy" From f0d01d55e77bb9d42a6642debc10338546719a7c Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Fri, 19 Apr 2019 10:09:10 -0700 Subject: [PATCH 040/121] Revert "Remove IAM LaunchConfig Permissions" This reverts commit b47b0a7402d7be6d1e080618cb43a3d0d5543929. --- docs/iam.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/iam.rst b/docs/iam.rst index 7eb1a615c5..40787131e3 100644 --- a/docs/iam.rst +++ b/docs/iam.rst @@ -276,6 +276,7 @@ To: :: "Sid": "AutoScalingDescribe", "Action": [ "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeLaunchConfigurations", "autoscaling:DescribeAutoScalingInstances" ], "Effect": "Allow", @@ -285,6 +286,7 @@ To: :: "Sid": "AutoScalingModify", "Action": [ "autoscaling:CreateAutoScalingGroup", + "autoscaling:CreateLaunchConfiguration", "ec2:CreateLaunchTemplate", "ec2:ModifyLaunchTemplate", "ec2:DeleteLaunchTemplate", @@ -293,6 +295,7 @@ To: :: "autoscaling:PutNotificationConfiguration", "autoscaling:UpdateAutoScalingGroup", "autoscaling:PutScalingPolicy", + "autoscaling:DeleteLaunchConfiguration", "autoscaling:DescribeScalingActivities", "autoscaling:DeleteAutoScalingGroup", "autoscaling:DeletePolicy" From c5e10410bf803409240ebaaf3df5d3cdda9f8e05 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Fri, 19 Apr 2019 10:09:26 -0700 Subject: [PATCH 041/121] Revert "Remove Launch Configurations" This reverts commit 6df2e627a9a14a0133535afb57dd170d18bd1852. 
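Note for context: LaunchTemplates are not supported in us-gov-east-1, so
the LaunchConfig path cannot simply be deleted. A follow-up commit in
this series re-keys the conditions on that region alone; as a sketch, the
restored condition ends up shaped like this (excerpt mirroring the later
diff in this series, not the full Conditions block):

```json
"CreateLaunchConfig": {
  "Fn::And": [
    {
      "Fn::Equals": [
        {
          "Ref": "AWS::Region"
        },
        "us-gov-east-1"
      ]
    },
    {
      "Condition": "CreateComputeFleet"
    }
  ]
}
```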
--- cloudformation/aws-parallelcluster.cfn.json | 714 +++++++++++++++++++- 1 file changed, 705 insertions(+), 9 deletions(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 89adc24458..2c4c92c8b8 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1118,6 +1118,45 @@ "aws-us-gov" ] }, + "ChinaRegion": { + "Fn::Equals": [ + { + "Ref": "AWS::Partition" + }, + "aws-cn" + ] + }, + "CreateLaunchTemplate": { + "Fn::And": [ + { + "Fn::Not": [ + { + "Fn::Or": [ + { + "Condition": "ChinaRegion" + }, + { + "Condition": "GovCloudRegion" + } + ] + } + ] + }, + { + "Condition": "CreateComputeFleet" + } + ] + }, + "CreateLaunchConfig": { + "Fn::And": [ + { + "Condition": "ChinaRegion" + }, + { + "Condition": "CreateComputeFleet" + } + ] + }, "CreateComputeFleet": { "Fn::Not": [ { @@ -2584,16 +2623,35 @@ ] } ], + "LaunchConfigurationName": { + "Fn::If": [ + "CreateLaunchConfig", + { + "Ref": "ComputeServerLaunchConfig" + }, + { + "Ref": "AWS::NoValue" + } + ] + }, "LaunchTemplate": { - "LaunchTemplateId": { - "Ref": "ComputeServerLaunchTemplate" - }, - "Version": { - "Fn::GetAtt": [ - "ComputeServerLaunchTemplate", - "LatestVersionNumber" - ] - } + "Fn::If": [ + "CreateLaunchTemplate", + { + "LaunchTemplateId": { + "Ref": "ComputeServerLaunchTemplate" + }, + "Version": { + "Fn::GetAtt": [ + "ComputeServerLaunchTemplate", + "LatestVersionNumber" + ] + } + }, + { + "Ref": "AWS::NoValue" + } + ] }, "MinSize": { "Ref": "MinSize" @@ -2699,6 +2757,643 @@ }, "Condition": "CreateComputeFleet" }, + "ComputeServerLaunchConfig": { + "Type": "AWS::AutoScaling::LaunchConfiguration", + "Condition": "CreateLaunchConfig", + "Properties": { + "SecurityGroups": [ + { + "Fn::If": [ + "CreateSecurityGroups", + { + "Ref": "ComputeSecurityGroup" + }, + { + "Ref": "AWS::NoValue" + } + ] + }, + { + "Fn::If": [ + "AddAdditionalSG", + { + "Ref": "AdditionalSG" + }, + { + "Ref": "AWS::NoValue" + } + ] + }, + { + "Fn::If": [ + "UseExistingSecurityGroup", + { + "Ref": "VPCSecurityGroupId" + }, + { + "Ref": "AWS::NoValue" + } + ] + } + ], + "AssociatePublicIpAddress": { + "Fn::If": [ + "ComputePublicIps", + true, + false + ] + }, + "InstanceType": { + "Ref": "ComputeInstanceType" + }, + "KeyName": { + "Ref": "KeyName" + }, + "IamInstanceProfile": { + "Ref": "RootInstanceProfile" + }, + "SpotPrice": { + "Fn::If": [ + "UseSpotInstances", + { + "Ref": "SpotPrice" + }, + { + "Ref": "AWS::NoValue" + } + ] + }, + "ImageId": { + "Fn::If": [ + "UseCustomAMI", + { + "Ref": "CustomAMI" + }, + { + "Fn::FindInMap": [ + "AWSRegionOS2AMI", + { + "Ref": "AWS::Region" + }, + { + "Ref": "BaseOS" + } + ] + } + ] + }, + "InstanceMonitoring": false, + "BlockDeviceMappings": [ + { + "DeviceName": "/dev/xvdba", + "VirtualName": "ephemeral0" + }, + { + "DeviceName": "/dev/xvdbb", + "VirtualName": "ephemeral1" + }, + { + "DeviceName": "/dev/xvdbc", + "VirtualName": "ephemeral2" + }, + { + "DeviceName": "/dev/xvdbd", + "VirtualName": "ephemeral3" + }, + { + "DeviceName": "/dev/xvdbe", + "VirtualName": "ephemeral4" + }, + { + "DeviceName": "/dev/xvdbf", + "VirtualName": "ephemeral5" + }, + { + "DeviceName": "/dev/xvdbg", + "VirtualName": "ephemeral6" + }, + { + "DeviceName": "/dev/xvdbh", + "VirtualName": "ephemeral7" + }, + { + "DeviceName": "/dev/xvdbi", + "VirtualName": "ephemeral8" + }, + { + "DeviceName": "/dev/xvdbj", + "VirtualName": "ephemeral9" + }, + { + "DeviceName": "/dev/xvdbk", + "VirtualName": "ephemeral10" + }, + { + 
"DeviceName": "/dev/xvdbl", + "VirtualName": "ephemeral11" + }, + { + "DeviceName": "/dev/xvdbm", + "VirtualName": "ephemeral12" + }, + { + "DeviceName": "/dev/xvdbn", + "VirtualName": "ephemeral13" + }, + { + "DeviceName": "/dev/xvdbo", + "VirtualName": "ephemeral14" + }, + { + "DeviceName": "/dev/xvdbp", + "VirtualName": "ephemeral15" + }, + { + "DeviceName": "/dev/xvdbq", + "VirtualName": "ephemeral16" + }, + { + "DeviceName": "/dev/xvdbr", + "VirtualName": "ephemeral17" + }, + { + "DeviceName": "/dev/xvdbs", + "VirtualName": "ephemeral18" + }, + { + "DeviceName": "/dev/xvdbt", + "VirtualName": "ephemeral19" + }, + { + "DeviceName": "/dev/xvdbu", + "VirtualName": "ephemeral20" + }, + { + "DeviceName": "/dev/xvdbv", + "VirtualName": "ephemeral21" + }, + { + "DeviceName": "/dev/xvdbw", + "VirtualName": "ephemeral22" + }, + { + "DeviceName": "/dev/xvdbx", + "VirtualName": "ephemeral23" + }, + { + "DeviceName": { + "Fn::FindInMap": [ + "OSFeatures", + { + "Ref": "BaseOS" + }, + "RootDevice" + ] + }, + "Ebs": { + "VolumeSize": { + "Ref": "ComputeRootVolumeSize" + }, + "VolumeType": "gp2" + } + } + ], + "PlacementTenancy": { + "Fn::If": [ + "UseDedicatedTenancy", + { + "Ref": "Tenancy" + }, + { + "Ref": "AWS::NoValue" + } + ] + }, + "UserData": { + "Fn::Base64": { + "Fn::Join": [ + "", + [ + "Content-Type: multipart/mixed; boundary=\"==BOUNDARY==\"\n", + "MIME-Version: 1.0\n\n", + "--==BOUNDARY==\n", + "Content-Type: text/cloud-config; charset=\"us-ascii\"\n", + "MIME-Version: 1.0\n\n", + "#cloud-config:\n", + "runcmd:\n", + " - [ sh, -c, 'which yum && echo \"proxy=", + { + "Fn::If": [ + "UseProxy", + { + "Ref": "ProxyServer" + }, + "_none_" + ] + }, + "\" >> /etc/yum.conf || echo \"Not yum system\"' ]\n", + " - [ sh, -c, 'which apt-get && echo \"Acquire::http::Proxy \\\"", + { + "Fn::If": [ + "UseProxy", + { + "Ref": "ProxyServer" + }, + "false" + ] + }, + "\\\";\" >> /etc/apt/apt.conf || echo \"Not apt system\"' ]\n", + "--==BOUNDARY==\n", + "Content-Type: text/x-shellscript; charset=\"us-ascii\"\n", + "MIME-Version: 1.0\n\n", + "#!/bin/bash -x\n\n", + "function error_exit\n", + "{\n", + " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", + { + "Ref": "AWS::StackName" + }, + " --resource=ComputeFleet --region=", + { + "Ref": "AWS::Region" + }, + "\n", + " exit 1\n", + "}\n", + "function vendor_cookbook\n", + "{\n", + " mkdir /tmp/cookbooks\n", + " cd /tmp/cookbooks\n", + " tar -xzf /etc/chef/aws-parallelcluster-cookbook.tgz\n", + " HOME_BAK=${HOME}\n", + " export HOME=\"/tmp\"\n", + " . 
/tmp/proxy.sh; for d in `ls /tmp/cookbooks`; do cd /tmp/cookbooks/$d;LANG=en_US.UTF-8 /opt/chef/embedded/bin/berks vendor /etc/chef/cookbooks --delete; done;\n", + " export HOME=${HOME_BAK}\n", + "}\n", + "function bootstrap_instance\n", + "{\n", + " which yum 2>/dev/null; yum=$?\n", + " which apt-get 2>/dev/null; apt=$?\n", + " if [ \"${yum}\" == \"0\" ]; then\n", + " yum -y groupinstall development && yum -y install curl wget jq\n", + " fi\n", + " if [ \"${apt}\" == \"0\" ]; then\n", + " apt-cache search build-essential; apt-get clean; apt-get update; apt-get -y install build-essential curl wget jq\n", + " fi\n", + " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn\"", + " which cfn-init 2>/dev/null || ( curl -s -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://s3.${s3_url}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz; easy_install -U /tmp/aws-cfn-bootstrap-latest.tar.gz)\n", + " mkdir -p /etc/chef && chown -R root:root /etc/chef\n", + " curl --retry 3 -L https://www.chef.io/chef/install.sh | bash -s -- -v ${chef_version}\n", + " /opt/chef/embedded/bin/gem install --no-rdoc --no-ri ridley:${ridley_version} berkshelf:${berkshelf_version}\n", + " curl --retry 3 -s -L -o /etc/chef/aws-parallelcluster-cookbook.tgz ${cookbook_url}\n", + " curl --retry 3 -s -L -o /etc/chef/aws-parallelcluster-cookbook.tgz.date ${cookbook_url}.date\n", + " curl --retry 3 -s -L -o /etc/chef/aws-parallelcluster-cookbook.tgz.md5 ${cookbook_url}.md5\n", + " vendor_cookbook\n", + " mkdir /opt/parallelcluster && echo ${parallelcluster_version} | tee /opt/parallelcluster/.bootstrapped\n", + "}\n", + "proxy=", + { + "Ref": "ProxyServer" + }, + "\n", + "custom_cookbook=", + { + "Ref": "CustomChefCookbook" + }, + "\n", + "if [ \"${proxy}\" != \"NONE\" ]; then\n", + " proxy_args=\"--http-proxy=${proxy} --https-proxy=${proxy}\"\n", + " proxy_host=$(echo \"${proxy}\" | awk -F/ '{print $3}' | cut -d: -f1)\n", + " proxy_port=$(echo \"${proxy}\" | awk -F/ '{print $3}' | cut -d: -f2)\n", + " export http_proxy=${proxy}; export https_proxy=${http_proxy}\n", + " export HTTP_PROXY=${proxy}; export HTTPS_PROXY=${http_proxy}\n", + " export no_proxy=169.254.169.254; export NO_PROXY=169.254.169.254\n", + " echo -e \"export http_proxy=${proxy}; export https_proxy=${http_proxy}\nexport HTTP_PROXY=${proxy}; export HTTPS_PROXY=${http_proxy}\nexport no_proxy=169.254.169.254; export NO_PROXY=169.254.169.254\n\" >/tmp/proxy.sh\n", + " echo -e \"[Boto]\nproxy = ${proxy_host}\nproxy_port = ${proxy_port}\n\" >/etc/boto.cfg\n", + "else\n", + " proxy_args=\"\"\n", + " touch /tmp/proxy.sh\n", + "fi\n", + " _region=", + { + "Ref": "AWS::Region" + }, + "\n", + "s3_url=", + { + "Fn::FindInMap": [ + "Partition2Url", + { + "Ref": "AWS::Partition" + }, + "url" + ] + }, + "\n", + "if [ \"${custom_cookbook}\" != \"NONE\" ]; then\n", + " cookbook_url=${custom_cookbook}\n", + "else\n", + " cookbook_url=https://s3.${_region}.${s3_url}/${_region}-aws-parallelcluster/cookbooks/", + { + "Fn::FindInMap": [ + "PackagesVersions", + "default", + "cookbook" + ] + }, + ".tgz\n", + "fi\n", + "export PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin\n", + "export parallelcluster_version=aws-parallelcluster-", + { + "Fn::FindInMap": [ + "PackagesVersions", + "default", + "parallelcluster" + ] + }, + "\n", + "export cookbook_version=", + { + "Fn::FindInMap": [ + "PackagesVersions", + "default", + "cookbook" + ] + }, + "\n", + "export chef_version=", + { + "Fn::FindInMap": [ + "PackagesVersions", + "default", + 
"chef" + ] + }, + "\n", + "export ridley_version=", + { + "Fn::FindInMap": [ + "PackagesVersions", + "default", + "ridley" + ] + }, + "\n", + "export berkshelf_version=", + { + "Fn::FindInMap": [ + "PackagesVersions", + "default", + "berkshelf" + ] + }, + "\n", + "if [ -f /opt/parallelcluster/.bootstrapped ]; then\n", + " installed_version=$(cat /opt/parallelcluster/.bootstrapped)\n", + " if [ \"${parallelcluster_version}\" != \"${installed_version}\" ]; then\n", + " bootstrap_instance\n", + " fi\n", + "else\n", + " bootstrap_instance\n", + "fi\n", + "if [ \"${custom_cookbook}\" != \"NONE\" ]; then\n", + " curl --retry 3 -v -L -o /etc/chef/aws-parallelcluster-cookbook.tgz -z \"$(cat /etc/chef/aws-parallelcluster-cookbook.tgz.date)\" ${cookbook_url}\n", + " vendor_cookbook\n", + "fi\n", + "cd /tmp\n", + "# Call CloudFormation\n", + "cfn-init ${proxy_args} -s ", + { + "Ref": "AWS::StackName" + }, + " -v -c default -r ComputeServerLaunchConfig --region ", + { + "Ref": "AWS::Region" + }, + " || error_exit 'Failed to run cfn-init. If --norollback was specified, check /var/log/cfn-init.log and /var/log/cloud-init-output.log.'\n", + "cfn-signal ${proxy_args} --exit-code=0 --reason=\"MasterServer setup complete\" --stack=", + { + "Ref": "AWS::StackName" + }, + " --resource=ComputeFleet --region=", + { + "Ref": "AWS::Region" + }, + "\n", + "# End of file\n", + "--==BOUNDARY==\n" + ] + ] + } + } + }, + "Metadata": { + "Comment": "AWS ParallelCluster Compute server", + "AWS::CloudFormation::Init": { + "configSets": { + "default": [ + "deployConfigFiles", + "chefPrepEnv", + "shellRunPreInstall", + "chefConfig", + "shellRunPostInstall", + "signalComputeReady" + ] + }, + "deployConfigFiles": { + "files": { + "/tmp/dna.json": { + "mode": "000644", + "owner": "root", + "group": "root", + "content": { + "cfncluster": { + "stack_name": { + "Ref": "AWS::StackName" + }, + "cfn_raid_parameters": { + "Ref": "RAIDOptions" + }, + "cfn_preinstall": { + "Ref": "PreInstallScript" + }, + "cfn_preinstall_args": { + "Ref": "PreInstallArgs" + }, + "cfn_postinstall": { + "Ref": "PostInstallScript" + }, + "cfn_postinstall_args": { + "Ref": "PostInstallArgs" + }, + "cfn_region": { + "Ref": "AWS::Region" + }, + "cfn_efs": { + "Fn::If": [ + "CreateEFSSubstack", + { + "Fn::GetAtt": [ + "EFSSubstack", + "Outputs.FileSystemId" + ] + }, + "" + ] + }, + "cfn_efs_shared_dir": { + "Ref": "EFSOptions" + }, + "cfn_fsx_fs_id": { + "Fn::If": [ + "CreateFSXSubstack", + { + "Fn::GetAtt": [ + "FSXSubstack", + "Outputs.FileSystemId" + ] + }, + "" + ] + }, + "cfn_fsx_options": { + "Ref": "FSXOptions" + }, + "cfn_scheduler": { + "Ref": "Scheduler" + }, + "cfn_scaledown_idletime": { + "Ref": "ScaleDownIdleTime" + }, + "cfn_encrypted_ephemeral": { + "Ref": "EncryptedEphemeral" + }, + "cfn_ephemeral_dir": { + "Ref": "EphemeralDir" + }, + "cfn_shared_dir": { + "Ref": "SharedDir" + }, + "cfn_proxy": { + "Ref": "ProxyServer" + }, + "cfn_sqs_queue": { + "Ref": "SQS" + }, + "cfn_master": { + "Fn::GetAtt": [ + "MasterServer", + "PrivateDnsName" + ] + }, + "cfn_node_type": "ComputeFleet", + "cfn_cluster_user": { + "Fn::FindInMap": [ + "OSFeatures", + { + "Ref": "BaseOS" + }, + "User" + ] + } + }, + "run_list": { + "Fn::If": [ + "UseCustomRunList", + { + "Ref": "CustomChefRunList" + }, + { + "Fn::Join": [ + "", + [ + "recipe[aws-parallelcluster::", + { + "Ref": "Scheduler" + }, + "_config]" + ] + ] + } + ] + } + } + }, + "/etc/chef/client.rb": { + "mode": "000644", + "owner": "root", + "group": "root", + "content": { + "Fn::Join": [ + "", + [ + 
"cookbook_path ['/etc/chef/cookbooks']" + ] + ] + } + }, + "/tmp/extra.json": { + "mode": "000644", + "owner": "root", + "group": "root", + "content": { + "Ref": "ExtraJson" + } + } + }, + "commands": { + "mkdir": { + "command": "mkdir -p /etc/chef/ohai/hints" + }, + "touch": { + "command": "touch /etc/chef/ohai/hints/ec2.json" + }, + "jq": { + "command": "jq --argfile f1 /tmp/dna.json --argfile f2 /tmp/extra.json -n '$f1 + $f2 | .cfncluster = $f1.cfncluster + $f2.cfncluster' > /etc/chef/dna.json || ( echo \"jq not installed\"; cp /tmp/dna.json /etc/chef/dna.json )" + } + } + }, + "chefPrepEnv": { + "commands": { + "chef": { + "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster::_prep_env", + "cwd": "/etc/chef" + } + } + }, + "shellRunPreInstall": { + "commands": { + "runpreinstall": { + "command": "/opt/parallelcluster/scripts/fetch_and_run -preinstall" + } + } + }, + "chefConfig": { + "commands": { + "chef": { + "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json", + "cwd": "/etc/chef" + } + } + }, + "shellRunPostInstall": { + "commands": { + "runpostinstall": { + "command": "/opt/parallelcluster/scripts/fetch_and_run -postinstall" + } + } + }, + "signalComputeReady": { + "commands": { + "compute_ready": { + "command": "/opt/parallelcluster/scripts/compute_ready" + } + } + } + } + } + }, "ComputeServerLaunchTemplate": { "Type": "AWS::EC2::LaunchTemplate", "Properties": { @@ -3157,6 +3852,7 @@ } } }, + "Condition": "CreateLaunchTemplate", "Metadata": { "Comment": "AWS ParallelCluster Compute server", "AWS::CloudFormation::Init": { From 0cbc384363fcdc524cd7989fa1936a4ed261b57b Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Fri, 19 Apr 2019 10:09:43 -0700 Subject: [PATCH 042/121] Revert "Use LaunchTemplates in the GovCloud region" This reverts commit 4c2ac3e7941301ec7e481ebf7d0b7854b8c445f0. 
--- cloudformation/aws-parallelcluster.cfn.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 2c4c92c8b8..4a75856a80 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1150,7 +1150,14 @@ "CreateLaunchConfig": { "Fn::And": [ { - "Condition": "ChinaRegion" + "Fn::Or": [ + { + "Condition": "ChinaRegion" + }, + { + "Condition": "GovCloudRegion" + } + ] }, { "Condition": "CreateComputeFleet" From ab3ff81393822b5b29b91b3bc42a1906d33b065f Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Fri, 19 Apr 2019 12:02:02 -0700 Subject: [PATCH 043/121] Add LaunchConfig back to us-gov-east-1 LaunchTemplates are not supported in us-gov-east-1 :( Signed-off-by: Sean Smith --- cloudformation/aws-parallelcluster.cfn.json | 24 ++++++--------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 4a75856a80..c4a7e1b362 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1118,26 +1118,16 @@ "aws-us-gov" ] }, - "ChinaRegion": { - "Fn::Equals": [ - { - "Ref": "AWS::Partition" - }, - "aws-cn" - ] - }, "CreateLaunchTemplate": { "Fn::And": [ { "Fn::Not": [ { - "Fn::Or": [ + "Fn::Equals": [ { - "Condition": "ChinaRegion" + "Ref": "AWS::Region" }, - { - "Condition": "GovCloudRegion" - } + "us-gov-east-1" ] } ] @@ -1150,13 +1140,11 @@ "CreateLaunchConfig": { "Fn::And": [ { - "Fn::Or": [ + "Fn::Equals": [ { - "Condition": "ChinaRegion" + "Ref": "AWS::Region" }, - { - "Condition": "GovCloudRegion" - } + "us-gov-east-1" ] }, { From 262b2869b1f8e081fd4fead6ec501f1f96d4831c Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Mon, 22 Apr 2019 13:55:48 -0700 Subject: [PATCH 044/121] Change OrderedDict Initialization Signed-off-by: Sean Smith --- cli/pcluster/cfnconfig.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index 3cfc145787..c5f43dda54 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -460,10 +460,13 @@ def __init_size_parameters(self): self.parameters["MaxSize"] = "10" size_parameters = OrderedDict( - initial_queue_size=("InitialQueueSize", None), - maintain_initial_size=("MaintainInitialSize", None), - max_queue_size=("MaxQueueSize", None), + [ + ("initial_queue_size", ("InitialQueueSize", None)), + ("maintain_initial_size", ("MaintainInitialSize", None)), + ("max_queue_size", ("MaxQueueSize", None)), + ] ) + for key in size_parameters: try: __temp__ = self.__config.get(self.__cluster_section, key) From 495710de20c79c245b1e1bca34f71f9ba6863b0e Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 23 Apr 2019 19:20:37 +0200 Subject: [PATCH 045/121] Revert "integ tests: disabling tests for centos7-torque until bug is solved" This reverts commit 52b426415a19447874443aca39b118d87acc3a7d. 
Signed-off-by: Francesco De Martino
---
 tests/integration-tests/tests/test_scaling.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py
index 568663460d..d3d54504c6 100644
--- a/tests/integration-tests/tests/test_scaling.py
+++ b/tests/integration-tests/tests/test_scaling.py
@@ -23,7 +23,6 @@
 @pytest.mark.skip_schedulers(["awsbatch"])
 @pytest.mark.skip_dimensions("cn-northwest-1", "*", "ubuntu1404", "*")  # aws-cfn-bootstrap missing in Ningxia region
 @pytest.mark.skip_dimensions("cn-northwest-1", "*", "ubuntu1604", "*")  # aws-cfn-bootstrap missing in Ningxia region
-@pytest.mark.skip_dimensions("*", "*", "centos7", "torque")  # https://github.com/aws/aws-parallelcluster/issues/875
 @pytest.mark.usefixtures("region", "os", "instance")
 def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
     scaledown_idletime = 4

From 47d233824653c061b84dc4a17b8d0f50c8e50384 Mon Sep 17 00:00:00 2001
From: Sean Smith
Date: Thu, 18 Apr 2019 21:15:03 -0700
Subject: [PATCH 046/121] Remove AWS Credentials from ParallelCluster Config

For a better security posture, we're removing AWS credentials from the
parallelcluster config file. Credentials can be set up with the AWS CLI,
for example:

```bash
$ aws configure
AWS Access Key ID [None]: AKIAI44QH8DHBEXAMPLE
AWS Secret Access Key [None]: je7MtGbClwBF/2Zp9Utk/h3yCo8nvbEXAMPLEKEY
Default region name [None]: us-east-1
Default output format [None]: text
```

See https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html

Signed-off-by: Sean Smith
---
 README.rst                   |  2 --
 cli/pcluster/easyconfig.py   | 49 +++++++++++-------------------
 cli/pcluster/examples/config |  7 ------
 docs/configuration.rst       | 12 +++------
 docs/getting_started.rst     | 22 +++++++++-------
 5 files changed, 30 insertions(+), 62 deletions(-)

diff --git a/README.rst b/README.rst
index b058b60ee5..583643ad4d 100644
--- a/README.rst
+++ b/README.rst
@@ -41,8 +41,6 @@ Then, run pcluster configure:
 
     $ pcluster configure
     Cluster Template [default]:
-    AWS Access Key ID []:
-    AWS Secret Access Key ID []:
     Acceptable Values for AWS Region ID:
         ap-south-1
     ...
diff --git a/cli/pcluster/easyconfig.py b/cli/pcluster/easyconfig.py index b89ab4f346..2af4d7a527 100644 --- a/cli/pcluster/easyconfig.py +++ b/cli/pcluster/easyconfig.py @@ -40,6 +40,7 @@ def wrapper(*args, **kwargs): except (BotoCoreError, ClientError) as e: print("Failed with error: %s" % e) print("Hint: please check your AWS credentials.") + print("Run `aws configure` or set the credentials as environment variables.") sys.exit(1) return wrapper @@ -80,7 +81,8 @@ def get_regions(): return [region.get("RegionName") for region in regions if region.get("RegionName") not in unsupported_regions] -def ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name): +@handle_client_exception +def ec2_conn(aws_region_name): if aws_region_name: region = aws_region_name elif os.environ.get("AWS_DEFAULT_REGION"): @@ -88,15 +90,13 @@ def ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name): else: region = "us-east-1" - ec2 = boto3.client( - "ec2", region_name=region, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key - ) + ec2 = boto3.client("ec2", region_name=region) return ec2 @handle_client_exception -def list_keys(aws_access_key_id, aws_secret_access_key, aws_region_name): - conn = ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name) +def list_keys(aws_region_name): + conn = ec2_conn(aws_region_name) keypairs = conn.describe_key_pairs() keynames = [] for key in keypairs.get("KeyPairs"): @@ -111,8 +111,8 @@ def list_keys(aws_access_key_id, aws_secret_access_key, aws_region_name): @handle_client_exception -def list_vpcs(aws_access_key_id, aws_secret_access_key, aws_region_name): - conn = ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name) +def list_vpcs(aws_region_name): + conn = ec2_conn(aws_region_name) vpcs = conn.describe_vpcs() vpcids = [] for vpc in vpcs.get("Vpcs"): @@ -127,8 +127,8 @@ def list_vpcs(aws_access_key_id, aws_secret_access_key, aws_region_name): @handle_client_exception -def list_subnets(aws_access_key_id, aws_secret_access_key, aws_region_name, vpc_id): - conn = ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name) +def list_subnets(aws_region_name, vpc_id): + conn = ec2_conn(aws_region_name) subnets = conn.describe_subnets(Filters=[{"Name": "vpcId", "Values": [vpc_id]}]) subnetids = [] for subnet in subnets.get("Subnets"): @@ -161,22 +161,6 @@ def configure(args): # noqa: C901 FIXME!!! "Cluster Template", config.get("global", "cluster_template") if config.has_option("global", "cluster_template") else "default", ) - aws_access_key_id = prompt( - "AWS Access Key ID", - config.get("aws", "aws_access_key_id") if config.has_option("aws", "aws_access_key_id") else None, - True, - ) - aws_secret_access_key = prompt( - "AWS Secret Access Key ID", - config.get("aws", "aws_secret_access_key") if config.has_option("aws", "aws_secret_access_key") else None, - True, - ) - if not aws_access_key_id or not aws_secret_access_key: - print( - "You chose not to configure aws credentials in parallelcluster config file.\n" - "Please make sure you export a valid AWS_PROFILE or you have them exported in " - "the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables." - ) # Use built in boto regions as an available option aws_region_name = prompt( @@ -198,13 +182,13 @@ def configure(args): # noqa: C901 FIXME!!! 
config.get("cluster " + cluster_template, "key_name") if config.has_option("cluster " + cluster_template, "key_name") else None, - options=list_keys(aws_access_key_id, aws_secret_access_key, aws_region_name), + options=list_keys(aws_region_name), check_validity=True, ) vpc_id = prompt( "VPC ID", config.get("vpc " + vpcname, "vpc_id") if config.has_option("vpc " + vpcname, "vpc_id") else None, - options=list_vpcs(aws_access_key_id, aws_secret_access_key, aws_region_name), + options=list_vpcs(aws_region_name), check_validity=True, ) master_subnet_id = prompt( @@ -212,7 +196,7 @@ def configure(args): # noqa: C901 FIXME!!! config.get("vpc " + vpcname, "master_subnet_id") if config.has_option("vpc " + vpcname, "master_subnet_id") else None, - options=list_subnets(aws_access_key_id, aws_secret_access_key, aws_region_name, vpc_id), + options=list_subnets(aws_region_name, vpc_id), check_validity=True, ) @@ -223,12 +207,7 @@ def configure(args): # noqa: C901 FIXME!!! "update_check": "true", "sanity_check": "true", } - s_aws = { - "__name__": "aws", - "aws_access_key_id": aws_access_key_id, - "aws_secret_access_key": aws_secret_access_key, - "aws_region_name": aws_region_name, - } + s_aws = {"__name__": "aws", "aws_region_name": aws_region_name} s_aliases = {"__name__": "aliases", "ssh": "ssh {CFN_USER}@{MASTER_IP} {ARGS}"} s_cluster = {"__name__": "cluster " + cluster_template, "key_name": key_name, "vpc_settings": vpcname} s_vpc = {"__name__": "vpc " + vpcname, "vpc_id": vpc_id, "master_subnet_id": master_subnet_id} diff --git a/cli/pcluster/examples/config b/cli/pcluster/examples/config index fd50b2cbec..6da6acbdba 100644 --- a/cli/pcluster/examples/config +++ b/cli/pcluster/examples/config @@ -8,13 +8,6 @@ update_check = true sanity_check = true [aws] -# This is the AWS credentials section (required). -# These settings apply to all clusters -# replace these with your AWS keys -# If not defined, boto will attempt to use a) environment -# or b) EC2 IAM role. -#aws_access_key_id = #your_aws_access_key_id -#aws_secret_access_key = #your_secret_access_key # Uncomment to specify a different Amazon AWS region (OPTIONAL) # (Defaults to us-east-1 if not defined in environment or below) #aws_region_name = #region diff --git a/docs/configuration.rst b/docs/configuration.rst index 4a10f6e0e7..1426680c8c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -55,16 +55,10 @@ Attempt to validate the existence of the resources defined in parameters. :: aws ^^^ -AWS credentials/region section. +AWS Region section. -These settings apply to all clusters and are REQUIRED. - -For security purposes, AWS highly recommends using the environment, EC2 IAM Roles, or the -`AWS CLI `_ to store credentials rather than saving into the AWS ParallelCluster config file. :: - - [aws] - aws_access_key_id = #your_aws_access_key_id - aws_secret_access_key = #your_secret_access_key +To store credentials, you can use environment variables, IAM roles, or the preferred way, the +`AWS CLI `_ :: # Defaults to us-east-1 if not defined in environment or below aws_region_name = #region diff --git a/docs/getting_started.rst b/docs/getting_started.rst index ccacf5bbdb..b351bbb76c 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -65,6 +65,16 @@ was originally installed: Configuring AWS ParallelCluster =============================== +First you'll need to setup your IAM credentials, see `AWS CLI `_. +for more information. 
+ +:: + $ aws configure + AWS Access Key ID [None]: AKIAIOSFODNN7EXAMPLE + AWS Secret Access Key [None]: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + Default region name [us-east-1]: us-east-1 + Default output format [None]: + Once installed you will need to setup some initial config. The easiest way to do this is below: :: @@ -79,15 +89,6 @@ cluster from. Cluster Template [mycluster]: -Next, you will be prompted for your AWS Access & Secret Keys. Enter the keys for an IAM user with administrative -privileges. -These can also be read from your environment variables or the AWS CLI config. - -:: - - AWS Access Key ID []: - AWS Secret Access Key ID []: - Now, you will be presented with a list of valid AWS region identifiers. Choose the region in which you'd like your cluster to run. @@ -234,3 +235,6 @@ to allow Inbound connection to the port 80 from your Public IP. .. spelling:: aws + wJalrXUtnFEMI + MDENG + bPxRfiCYEXAMPLEKEY From 31aa227c1ca5cc9d0468e91f6d0b52a7875e3c5f Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 26 Apr 2019 14:37:18 +0200 Subject: [PATCH 047/121] integ tests: skipping ubuntu tests in cn-northwest-1 globally Due to aws-cfn-bootstrap missing in Ningxia region Signed-off-by: Francesco De Martino --- tests/integration-tests/conftest_markers.py | 2 ++ tests/integration-tests/tests/test_scaling.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/conftest_markers.py b/tests/integration-tests/conftest_markers.py index 5e7115abba..3b1a1f7192 100644 --- a/tests/integration-tests/conftest_markers.py +++ b/tests/integration-tests/conftest_markers.py @@ -26,6 +26,8 @@ ("us-gov-west-1", "*", "*", "awsbatch"), ("us-gov-east-1", "*", "*", "awsbatch"), ("us-gov-east-1", "*", "c4.xlarge", "*"), + ("cn-northwest-1", "*", "ubuntu1404", "*"), # aws-cfn-bootstrap missing in Ningxia region + ("cn-northwest-1", "*", "ubuntu1604", "*"), # aws-cfn-bootstrap missing in Ningxia region ] diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index d3d54504c6..24e4d2d218 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -21,8 +21,6 @@ @pytest.mark.skip_schedulers(["awsbatch"]) -@pytest.mark.skip_dimensions("cn-northwest-1", "*", "ubuntu1404", "*") # aws-cfn-bootstrap missing in Ningxia region -@pytest.mark.skip_dimensions("cn-northwest-1", "*", "ubuntu1604", "*") # aws-cfn-bootstrap missing in Ningxia region @pytest.mark.usefixtures("region", "os", "instance") def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir): scaledown_idletime = 4 From bc2cf802d198e9722083f20a9a23befc4f60ab9b Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 26 Apr 2019 16:48:25 +0200 Subject: [PATCH 048/121] docs: add example for s3 resources Signed-off-by: Francesco De Martino --- docs/s3_resources.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/s3_resources.rst b/docs/s3_resources.rst index a366c2c09d..629b065a8c 100644 --- a/docs/s3_resources.rst +++ b/docs/s3_resources.rst @@ -31,3 +31,9 @@ This next example gives you read access to the bucket. This does **not** let you :: s3_read_resource = arn:aws:s3:::my_corporate_bucket + +This last example gives you read access to the bucket and to the items stored in the bucket. 
+ +:: + + s3_read_resource = arn:aws:s3:::my_corporate_bucket* From 87137efc794fd8761da16b22f39246d3a6c74177 Mon Sep 17 00:00:00 2001 From: ParallelCluster AMI bot Date: Mon, 29 Apr 2019 13:03:21 +0000 Subject: [PATCH 049/121] Update AMI List Build Number 96 aws-parallelcluster-cookbook Git hash: c0556ecc29a40d20225fe1d6bc40417c8759702d aws-parallelcluster-node Git hash: cd8e84016a974802267417dd5bd795922ca85fef Signed-off-by: ParallelCluster AMI bot --- amis.txt | 190 ++++++++++---------- cloudformation/aws-parallelcluster.cfn.json | 190 ++++++++++---------- 2 files changed, 190 insertions(+), 190 deletions(-) diff --git a/amis.txt b/amis.txt index ba448a7542..e4b601aa87 100644 --- a/amis.txt +++ b/amis.txt @@ -1,100 +1,100 @@ # alinux -ap-northeast-1: ami-08bbc6b440ae39278 -ap-northeast-2: ami-0b94cc099eb2afed1 -ap-northeast-3: ami-08f450d90bf811e75 -ap-south-1: ami-0991980c18e306e8e -ap-southeast-1: ami-0b28cded2d076c436 -ap-southeast-2: ami-0f6bd7f7e505539e1 -ca-central-1: ami-0b680d93b3d56c72a -cn-north-1: ami-0decad57dff4bb3ea -cn-northwest-1: ami-090c0f959ead7a813 -eu-central-1: ami-0f188f96685b31355 -eu-north-1: ami-08053d8c06274f531 -eu-west-1: ami-0fd242bc2abb1b146 -eu-west-2: ami-046b84d0cba5d292f -eu-west-3: ami-0046b758814704b9f -sa-east-1: ami-06cc41ea6409482e5 -us-east-1: ami-02787c6b3550be361 -us-east-2: ami-0c4b18bab302f095b -us-gov-east-1: ami-063828ccbf0ad3cfe -us-gov-west-1: ami-9bd7a6fa -us-west-1: ami-027a3a20855e7be03 -us-west-2: ami-00b4f23ba0a57a141 +ap-northeast-1: ami-072ac3e57034c3bb9 +ap-northeast-2: ami-080aecd84cc7434f0 +ap-northeast-3: ami-0b0ebcca7a6d3cc89 +ap-south-1: ami-0357da922b040bdb4 +ap-southeast-1: ami-0fa2ec4e5f0402d8d +ap-southeast-2: ami-00bcdbe79f9f9360f +ca-central-1: ami-06f86504395727a59 +cn-north-1: ami-0c7128d1d555a95f3 +cn-northwest-1: ami-0b838ce1114165ca1 +eu-central-1: ami-00917e29a34aa32ee +eu-north-1: ami-045b3be6347ece0fb +eu-west-1: ami-0b05e2f00ebee148c +eu-west-2: ami-0722446a78ea175f8 +eu-west-3: ami-0c54390b66173bd7d +sa-east-1: ami-0a47ff8d07b75a4e9 +us-east-1: ami-0d8d467ddca09f9ea +us-east-2: ami-0c8fd29db55b1fd99 +us-gov-east-1: ami-0646415808673ae07 +us-gov-west-1: ami-3b83f05a +us-west-1: ami-06258a4b05a6eb611 +us-west-2: ami-020c1ec32ad429b44 # centos6 -ap-northeast-1: ami-0b27f204a8adab23b -ap-northeast-2: ami-010f98c99b632ece4 -ap-northeast-3: ami-09dd084db04a10d89 -ap-south-1: ami-0f4b9b7eb6f7c8bf4 -ap-southeast-1: ami-0d756e8aa7628a1d9 -ap-southeast-2: ami-00e287b293a3b7978 -ca-central-1: ami-0f34197ba57c32264 -eu-central-1: ami-0560468e7585063ec -eu-north-1: ami-0b8ee035751765dd5 -eu-west-1: ami-07fb0fe1262b74210 -eu-west-2: ami-00bae65e0cb9fbd31 -eu-west-3: ami-0fa09ce7bae809182 -sa-east-1: ami-014a018823095eb14 -us-east-1: ami-0b0130c133fae6607 -us-east-2: ami-0f4af6aff781edc2e -us-west-1: ami-01fef859674ae14f1 -us-west-2: ami-0c0a6c53fc044e440 +ap-northeast-1: ami-0f5ea785473ed7ab5 +ap-northeast-2: ami-07c9635710bc567e6 +ap-northeast-3: ami-0b65ed5f7a0ff726a +ap-south-1: ami-0e18648a2d5b5e83c +ap-southeast-1: ami-0f784e8d9290e0f50 +ap-southeast-2: ami-0e5559991f7845038 +ca-central-1: ami-0ba632dba5f588a43 +eu-central-1: ami-04771fcea0287def6 +eu-north-1: ami-0adf183b1beb95c23 +eu-west-1: ami-0c965c8bf7c698110 +eu-west-2: ami-032a7702e546c7e42 +eu-west-3: ami-0b7b602a10f83961c +sa-east-1: ami-03309e4b0f7ef6ea5 +us-east-1: ami-0e2b588a37264f1cb +us-east-2: ami-09461a00d94eb74e5 +us-west-1: ami-04d12cccd51b29965 +us-west-2: ami-0e2efd8528c9ddb07 # centos7 -ap-northeast-1: ami-0784def0ccb2d5b82 -ap-northeast-2: 
ami-0d99fb81635d7615c -ap-northeast-3: ami-0c95572f59739c0bf -ap-south-1: ami-010b8728beb0383d6 -ap-southeast-1: ami-06ddf4654231a4d4c -ap-southeast-2: ami-0d1ab341d60525d01 -ca-central-1: ami-092fc6e2504e5f9a2 -eu-central-1: ami-003a4b7027af55f24 -eu-north-1: ami-071f5c908f62cc6cd -eu-west-1: ami-0bbd6b980d3313c64 -eu-west-2: ami-0907c15064d4edba5 -eu-west-3: ami-0125d2751f93718eb -sa-east-1: ami-07454a63f6b9e04de -us-east-1: ami-01f0260a02285b5dc -us-east-2: ami-0f9dab1d43d5744ed -us-west-1: ami-074340af697a98c37 -us-west-2: ami-0811b1738ec473064 +ap-northeast-1: ami-09b175846250d37e0 +ap-northeast-2: ami-05a371ea785419539 +ap-northeast-3: ami-056bfe9c042992212 +ap-south-1: ami-0cf72d39ece5fe7b8 +ap-southeast-1: ami-08e0a6e0d5e22df29 +ap-southeast-2: ami-04b6949a410a967ed +ca-central-1: ami-01cd6f223f4ca39af +eu-central-1: ami-00b98e76cefa6f7b8 +eu-north-1: ami-03bba2a04c7eee896 +eu-west-1: ami-0d7719633ec6daed8 +eu-west-2: ami-05bcb384a4c8b0c5b +eu-west-3: ami-054d53deecd76ad61 +sa-east-1: ami-089367ecc959c8bc4 +us-east-1: ami-0f942ac10a338af45 +us-east-2: ami-0edbaef69d274e64e +us-west-1: ami-036c33c35fa178b86 +us-west-2: ami-0bbd399489108361e # ubuntu1404 -ap-northeast-1: ami-0cd539d9e885af044 -ap-northeast-2: ami-0924984889a5d4a46 -ap-northeast-3: ami-07450b1aef436d233 -ap-south-1: ami-05f60d15408d42ac9 -ap-southeast-1: ami-0b1a46b1b9d2f4fb9 -ap-southeast-2: ami-0863af7ce35b4323f -ca-central-1: ami-09bbf1a83ca0f5ff0 -cn-north-1: ami-05f293ab117a7842d -eu-central-1: ami-0abac01d1d843346c -eu-north-1: ami-0bf9760df9cfca009 -eu-west-1: ami-091779c5c891c35c2 -eu-west-2: ami-07da7ca913a6ea97f -eu-west-3: ami-0427e1cae6b8e9f89 -sa-east-1: ami-0a228fdac1d498e8c -us-east-1: ami-05dbcb5308069c285 -us-east-2: ami-0a8e709dece19523d -us-gov-east-1: ami-0441f56840dcb67ca -us-gov-west-1: ami-a6d3a2c7 -us-west-1: ami-0e01f060099ea747b -us-west-2: ami-08d70cb46b697c172 +ap-northeast-1: ami-0348c76e890f528a1 +ap-northeast-2: ami-0c4b80f78e7f62047 +ap-northeast-3: ami-03aaa1e1ae3007767 +ap-south-1: ami-04317bb153d6a8fff +ap-southeast-1: ami-000ce989867413e4e +ap-southeast-2: ami-09a0b2d2bdc669a6c +ca-central-1: ami-0555c4a7c47384856 +cn-north-1: ami-0839700f2d6edb4e9 +eu-central-1: ami-0d35dcfebb277a64d +eu-north-1: ami-037dc7fd57f8c3151 +eu-west-1: ami-0c1b4a36f6da56db9 +eu-west-2: ami-08b285fd75a7a483b +eu-west-3: ami-0653aa7d98f3033ec +sa-east-1: ami-0024bc42c6c9cc832 +us-east-1: ami-063cb866ee81436bf +us-east-2: ami-0d7393fe329fac685 +us-gov-east-1: ami-06bcf13af6f8352b2 +us-gov-west-1: ami-a89deec9 +us-west-1: ami-0e9ab67cbeffe41b6 +us-west-2: ami-056be711a3a572d20 # ubuntu1604 -ap-northeast-1: ami-04b272dcdd06e1564 -ap-northeast-2: ami-0b3c32fb3a4e9c0ef -ap-northeast-3: ami-01f5ef9fbe48d7bb2 -ap-south-1: ami-0076ff722a254fdca -ap-southeast-1: ami-0a4ea1282cc83d41c -ap-southeast-2: ami-0fe13fe2f6911fa5d -ca-central-1: ami-0a7b7779aba55c024 -cn-north-1: ami-05d35ef899afade2f -eu-central-1: ami-0ae0084f39e8dce54 -eu-north-1: ami-0383b711246b9c840 -eu-west-1: ami-091dd999747fc79d7 -eu-west-2: ami-0cd1373102c0a4d5d -eu-west-3: ami-0f3bba95cd9c832d1 -sa-east-1: ami-00416702e6c84f6c7 -us-east-1: ami-00de8aa07f24052f6 -us-east-2: ami-0d7f19ca4f88c3044 -us-gov-east-1: ami-035e990927c75e7d8 -us-gov-west-1: ami-e1d4a580 -us-west-1: ami-0f02ce7616b3016af -us-west-2: ami-0b1d55af6440b3b96 +ap-northeast-1: ami-0446db0488549a643 +ap-northeast-2: ami-0b0bdd98877e4e506 +ap-northeast-3: ami-088f0307cdd06a985 +ap-south-1: ami-066e4cfcc460304e0 +ap-southeast-1: ami-0597821ac99a481e3 +ap-southeast-2: 
ami-03c9d6bdf2a2cba07 +ca-central-1: ami-0e4208d9ab789da40 +cn-north-1: ami-0e77f8ab0c91d2b9c +eu-central-1: ami-0457c581edab1213c +eu-north-1: ami-03f64544dad6ae6be +eu-west-1: ami-0aee65b23ac6121e5 +eu-west-2: ami-001486f35ffff87d9 +eu-west-3: ami-05641d46059f19f8f +sa-east-1: ami-00451f8f5429960a6 +us-east-1: ami-05736751ef500681f +us-east-2: ami-05f87726bd2802a79 +us-gov-east-1: ami-033192481089d3c64 +us-gov-west-1: ami-f3ec9f92 +us-west-1: ami-03ff95bfd61b9cb5e +us-west-2: ami-0d6d6a4cbeb63a57d diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index c4a7e1b362..c99096b19f 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1200,141 +1200,141 @@ "Mappings": { "AWSRegionOS2AMI": { "ap-northeast-1": { - "alinux": "ami-08bbc6b440ae39278", - "centos6": "ami-0b27f204a8adab23b", - "centos7": "ami-0784def0ccb2d5b82", - "ubuntu1404": "ami-0cd539d9e885af044", - "ubuntu1604": "ami-04b272dcdd06e1564" + "alinux": "ami-072ac3e57034c3bb9", + "centos6": "ami-0f5ea785473ed7ab5", + "centos7": "ami-09b175846250d37e0", + "ubuntu1404": "ami-0348c76e890f528a1", + "ubuntu1604": "ami-0446db0488549a643" }, "ap-northeast-2": { - "alinux": "ami-0b94cc099eb2afed1", - "centos6": "ami-010f98c99b632ece4", - "centos7": "ami-0d99fb81635d7615c", - "ubuntu1404": "ami-0924984889a5d4a46", - "ubuntu1604": "ami-0b3c32fb3a4e9c0ef" + "alinux": "ami-080aecd84cc7434f0", + "centos6": "ami-07c9635710bc567e6", + "centos7": "ami-05a371ea785419539", + "ubuntu1404": "ami-0c4b80f78e7f62047", + "ubuntu1604": "ami-0b0bdd98877e4e506" }, "ap-northeast-3": { - "alinux": "ami-08f450d90bf811e75", - "centos6": "ami-09dd084db04a10d89", - "centos7": "ami-0c95572f59739c0bf", - "ubuntu1404": "ami-07450b1aef436d233", - "ubuntu1604": "ami-01f5ef9fbe48d7bb2" + "alinux": "ami-0b0ebcca7a6d3cc89", + "centos6": "ami-0b65ed5f7a0ff726a", + "centos7": "ami-056bfe9c042992212", + "ubuntu1404": "ami-03aaa1e1ae3007767", + "ubuntu1604": "ami-088f0307cdd06a985" }, "ap-south-1": { - "alinux": "ami-0991980c18e306e8e", - "centos6": "ami-0f4b9b7eb6f7c8bf4", - "centos7": "ami-010b8728beb0383d6", - "ubuntu1404": "ami-05f60d15408d42ac9", - "ubuntu1604": "ami-0076ff722a254fdca" + "alinux": "ami-0357da922b040bdb4", + "centos6": "ami-0e18648a2d5b5e83c", + "centos7": "ami-0cf72d39ece5fe7b8", + "ubuntu1404": "ami-04317bb153d6a8fff", + "ubuntu1604": "ami-066e4cfcc460304e0" }, "ap-southeast-1": { - "alinux": "ami-0b28cded2d076c436", - "centos6": "ami-0d756e8aa7628a1d9", - "centos7": "ami-06ddf4654231a4d4c", - "ubuntu1404": "ami-0b1a46b1b9d2f4fb9", - "ubuntu1604": "ami-0a4ea1282cc83d41c" + "alinux": "ami-0fa2ec4e5f0402d8d", + "centos6": "ami-0f784e8d9290e0f50", + "centos7": "ami-08e0a6e0d5e22df29", + "ubuntu1404": "ami-000ce989867413e4e", + "ubuntu1604": "ami-0597821ac99a481e3" }, "ap-southeast-2": { - "alinux": "ami-0f6bd7f7e505539e1", - "centos6": "ami-00e287b293a3b7978", - "centos7": "ami-0d1ab341d60525d01", - "ubuntu1404": "ami-0863af7ce35b4323f", - "ubuntu1604": "ami-0fe13fe2f6911fa5d" + "alinux": "ami-00bcdbe79f9f9360f", + "centos6": "ami-0e5559991f7845038", + "centos7": "ami-04b6949a410a967ed", + "ubuntu1404": "ami-09a0b2d2bdc669a6c", + "ubuntu1604": "ami-03c9d6bdf2a2cba07" }, "ca-central-1": { - "alinux": "ami-0b680d93b3d56c72a", - "centos6": "ami-0f34197ba57c32264", - "centos7": "ami-092fc6e2504e5f9a2", - "ubuntu1404": "ami-09bbf1a83ca0f5ff0", - "ubuntu1604": "ami-0a7b7779aba55c024" + "alinux": "ami-06f86504395727a59", + "centos6": "ami-0ba632dba5f588a43", 
+ "centos7": "ami-01cd6f223f4ca39af", + "ubuntu1404": "ami-0555c4a7c47384856", + "ubuntu1604": "ami-0e4208d9ab789da40" }, "cn-north-1": { - "alinux": "ami-0decad57dff4bb3ea", - "ubuntu1404": "ami-05f293ab117a7842d", - "ubuntu1604": "ami-05d35ef899afade2f" + "alinux": "ami-0c7128d1d555a95f3", + "ubuntu1404": "ami-0839700f2d6edb4e9", + "ubuntu1604": "ami-0e77f8ab0c91d2b9c" }, "cn-northwest-1": { - "alinux": "ami-090c0f959ead7a813" + "alinux": "ami-0b838ce1114165ca1" }, "eu-central-1": { - "alinux": "ami-0f188f96685b31355", - "centos6": "ami-0560468e7585063ec", - "centos7": "ami-003a4b7027af55f24", - "ubuntu1404": "ami-0abac01d1d843346c", - "ubuntu1604": "ami-0ae0084f39e8dce54" + "alinux": "ami-00917e29a34aa32ee", + "centos6": "ami-04771fcea0287def6", + "centos7": "ami-00b98e76cefa6f7b8", + "ubuntu1404": "ami-0d35dcfebb277a64d", + "ubuntu1604": "ami-0457c581edab1213c" }, "eu-north-1": { - "alinux": "ami-08053d8c06274f531", - "centos6": "ami-0b8ee035751765dd5", - "centos7": "ami-071f5c908f62cc6cd", - "ubuntu1404": "ami-0bf9760df9cfca009", - "ubuntu1604": "ami-0383b711246b9c840" + "alinux": "ami-045b3be6347ece0fb", + "centos6": "ami-0adf183b1beb95c23", + "centos7": "ami-03bba2a04c7eee896", + "ubuntu1404": "ami-037dc7fd57f8c3151", + "ubuntu1604": "ami-03f64544dad6ae6be" }, "eu-west-1": { - "alinux": "ami-0fd242bc2abb1b146", - "centos6": "ami-07fb0fe1262b74210", - "centos7": "ami-0bbd6b980d3313c64", - "ubuntu1404": "ami-091779c5c891c35c2", - "ubuntu1604": "ami-091dd999747fc79d7" + "alinux": "ami-0b05e2f00ebee148c", + "centos6": "ami-0c965c8bf7c698110", + "centos7": "ami-0d7719633ec6daed8", + "ubuntu1404": "ami-0c1b4a36f6da56db9", + "ubuntu1604": "ami-0aee65b23ac6121e5" }, "eu-west-2": { - "alinux": "ami-046b84d0cba5d292f", - "centos6": "ami-00bae65e0cb9fbd31", - "centos7": "ami-0907c15064d4edba5", - "ubuntu1404": "ami-07da7ca913a6ea97f", - "ubuntu1604": "ami-0cd1373102c0a4d5d" + "alinux": "ami-0722446a78ea175f8", + "centos6": "ami-032a7702e546c7e42", + "centos7": "ami-05bcb384a4c8b0c5b", + "ubuntu1404": "ami-08b285fd75a7a483b", + "ubuntu1604": "ami-001486f35ffff87d9" }, "eu-west-3": { - "alinux": "ami-0046b758814704b9f", - "centos6": "ami-0fa09ce7bae809182", - "centos7": "ami-0125d2751f93718eb", - "ubuntu1404": "ami-0427e1cae6b8e9f89", - "ubuntu1604": "ami-0f3bba95cd9c832d1" + "alinux": "ami-0c54390b66173bd7d", + "centos6": "ami-0b7b602a10f83961c", + "centos7": "ami-054d53deecd76ad61", + "ubuntu1404": "ami-0653aa7d98f3033ec", + "ubuntu1604": "ami-05641d46059f19f8f" }, "sa-east-1": { - "alinux": "ami-06cc41ea6409482e5", - "centos6": "ami-014a018823095eb14", - "centos7": "ami-07454a63f6b9e04de", - "ubuntu1404": "ami-0a228fdac1d498e8c", - "ubuntu1604": "ami-00416702e6c84f6c7" + "alinux": "ami-0a47ff8d07b75a4e9", + "centos6": "ami-03309e4b0f7ef6ea5", + "centos7": "ami-089367ecc959c8bc4", + "ubuntu1404": "ami-0024bc42c6c9cc832", + "ubuntu1604": "ami-00451f8f5429960a6" }, "us-east-1": { - "alinux": "ami-02787c6b3550be361", - "centos6": "ami-0b0130c133fae6607", - "centos7": "ami-01f0260a02285b5dc", - "ubuntu1404": "ami-05dbcb5308069c285", - "ubuntu1604": "ami-00de8aa07f24052f6" + "alinux": "ami-0d8d467ddca09f9ea", + "centos6": "ami-0e2b588a37264f1cb", + "centos7": "ami-0f942ac10a338af45", + "ubuntu1404": "ami-063cb866ee81436bf", + "ubuntu1604": "ami-05736751ef500681f" }, "us-east-2": { - "alinux": "ami-0c4b18bab302f095b", - "centos6": "ami-0f4af6aff781edc2e", - "centos7": "ami-0f9dab1d43d5744ed", - "ubuntu1404": "ami-0a8e709dece19523d", - "ubuntu1604": "ami-0d7f19ca4f88c3044" + "alinux": 
"ami-0c8fd29db55b1fd99", + "centos6": "ami-09461a00d94eb74e5", + "centos7": "ami-0edbaef69d274e64e", + "ubuntu1404": "ami-0d7393fe329fac685", + "ubuntu1604": "ami-05f87726bd2802a79" }, "us-gov-east-1": { - "alinux": "ami-063828ccbf0ad3cfe", - "ubuntu1404": "ami-0441f56840dcb67ca", - "ubuntu1604": "ami-035e990927c75e7d8" + "alinux": "ami-0646415808673ae07", + "ubuntu1404": "ami-06bcf13af6f8352b2", + "ubuntu1604": "ami-033192481089d3c64" }, "us-gov-west-1": { - "alinux": "ami-9bd7a6fa", - "ubuntu1404": "ami-a6d3a2c7", - "ubuntu1604": "ami-e1d4a580" + "alinux": "ami-3b83f05a", + "ubuntu1404": "ami-a89deec9", + "ubuntu1604": "ami-f3ec9f92" }, "us-west-1": { - "alinux": "ami-027a3a20855e7be03", - "centos6": "ami-01fef859674ae14f1", - "centos7": "ami-074340af697a98c37", - "ubuntu1404": "ami-0e01f060099ea747b", - "ubuntu1604": "ami-0f02ce7616b3016af" + "alinux": "ami-06258a4b05a6eb611", + "centos6": "ami-04d12cccd51b29965", + "centos7": "ami-036c33c35fa178b86", + "ubuntu1404": "ami-0e9ab67cbeffe41b6", + "ubuntu1604": "ami-03ff95bfd61b9cb5e" }, "us-west-2": { - "alinux": "ami-00b4f23ba0a57a141", - "centos6": "ami-0c0a6c53fc044e440", - "centos7": "ami-0811b1738ec473064", - "ubuntu1404": "ami-08d70cb46b697c172", - "ubuntu1604": "ami-0b1d55af6440b3b96" + "alinux": "ami-020c1ec32ad429b44", + "centos6": "ami-0e2efd8528c9ddb07", + "centos7": "ami-0bbd399489108361e", + "ubuntu1404": "ami-056be711a3a572d20", + "ubuntu1604": "ami-0d6d6a4cbeb63a57d" } }, "OSFeatures": { From aa470f004d193708d2a7ca45363409bbfe09e281 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 29 Apr 2019 15:31:25 +0200 Subject: [PATCH 050/121] Add "name" filter to retrieve AMI list --- util/generate-ami-list.py | 1 + 1 file changed, 1 insertion(+) diff --git a/util/generate-ami-list.py b/util/generate-ami-list.py index b774eb4d5a..150e738644 100644 --- a/util/generate-ami-list.py +++ b/util/generate-ami-list.py @@ -62,6 +62,7 @@ def get_ami_list_from_ec2(regions, date, cookbook_git_ref, node_git_ref, version elif cookbook_git_ref and node_git_ref: filters.append({"Name": "tag:parallelcluster_cookbook_ref", "Values": ["%s" % cookbook_git_ref]}) filters.append({"Name": "tag:parallelcluster_node_ref", "Values": ["%s" % node_git_ref]}) + filters.append({"Name": "name", "Values": ["aws-parallelcluster-*"]}) else: print("Error: you can search for version and date or cookbook and node git reference") exit(1) From d1593be1761213f9a6ca21537520416edd21896e Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 29 Apr 2019 16:23:13 +0200 Subject: [PATCH 051/121] Revert "Update AMI List Build Number 96 aws-parallelcluster-cookbook Git hash: c0556ecc29a40d20225fe1d6bc40417c8759702d aws-parallelcluster-node Git hash: cd8e84016a974802267417dd5bd795922ca85fef" This reverts commit 87137efc794fd8761da16b22f39246d3a6c74177. 
--- amis.txt | 190 ++++++++++---------- cloudformation/aws-parallelcluster.cfn.json | 190 ++++++++++---------- 2 files changed, 190 insertions(+), 190 deletions(-) diff --git a/amis.txt b/amis.txt index e4b601aa87..ba448a7542 100644 --- a/amis.txt +++ b/amis.txt @@ -1,100 +1,100 @@ # alinux -ap-northeast-1: ami-072ac3e57034c3bb9 -ap-northeast-2: ami-080aecd84cc7434f0 -ap-northeast-3: ami-0b0ebcca7a6d3cc89 -ap-south-1: ami-0357da922b040bdb4 -ap-southeast-1: ami-0fa2ec4e5f0402d8d -ap-southeast-2: ami-00bcdbe79f9f9360f -ca-central-1: ami-06f86504395727a59 -cn-north-1: ami-0c7128d1d555a95f3 -cn-northwest-1: ami-0b838ce1114165ca1 -eu-central-1: ami-00917e29a34aa32ee -eu-north-1: ami-045b3be6347ece0fb -eu-west-1: ami-0b05e2f00ebee148c -eu-west-2: ami-0722446a78ea175f8 -eu-west-3: ami-0c54390b66173bd7d -sa-east-1: ami-0a47ff8d07b75a4e9 -us-east-1: ami-0d8d467ddca09f9ea -us-east-2: ami-0c8fd29db55b1fd99 -us-gov-east-1: ami-0646415808673ae07 -us-gov-west-1: ami-3b83f05a -us-west-1: ami-06258a4b05a6eb611 -us-west-2: ami-020c1ec32ad429b44 +ap-northeast-1: ami-08bbc6b440ae39278 +ap-northeast-2: ami-0b94cc099eb2afed1 +ap-northeast-3: ami-08f450d90bf811e75 +ap-south-1: ami-0991980c18e306e8e +ap-southeast-1: ami-0b28cded2d076c436 +ap-southeast-2: ami-0f6bd7f7e505539e1 +ca-central-1: ami-0b680d93b3d56c72a +cn-north-1: ami-0decad57dff4bb3ea +cn-northwest-1: ami-090c0f959ead7a813 +eu-central-1: ami-0f188f96685b31355 +eu-north-1: ami-08053d8c06274f531 +eu-west-1: ami-0fd242bc2abb1b146 +eu-west-2: ami-046b84d0cba5d292f +eu-west-3: ami-0046b758814704b9f +sa-east-1: ami-06cc41ea6409482e5 +us-east-1: ami-02787c6b3550be361 +us-east-2: ami-0c4b18bab302f095b +us-gov-east-1: ami-063828ccbf0ad3cfe +us-gov-west-1: ami-9bd7a6fa +us-west-1: ami-027a3a20855e7be03 +us-west-2: ami-00b4f23ba0a57a141 # centos6 -ap-northeast-1: ami-0f5ea785473ed7ab5 -ap-northeast-2: ami-07c9635710bc567e6 -ap-northeast-3: ami-0b65ed5f7a0ff726a -ap-south-1: ami-0e18648a2d5b5e83c -ap-southeast-1: ami-0f784e8d9290e0f50 -ap-southeast-2: ami-0e5559991f7845038 -ca-central-1: ami-0ba632dba5f588a43 -eu-central-1: ami-04771fcea0287def6 -eu-north-1: ami-0adf183b1beb95c23 -eu-west-1: ami-0c965c8bf7c698110 -eu-west-2: ami-032a7702e546c7e42 -eu-west-3: ami-0b7b602a10f83961c -sa-east-1: ami-03309e4b0f7ef6ea5 -us-east-1: ami-0e2b588a37264f1cb -us-east-2: ami-09461a00d94eb74e5 -us-west-1: ami-04d12cccd51b29965 -us-west-2: ami-0e2efd8528c9ddb07 +ap-northeast-1: ami-0b27f204a8adab23b +ap-northeast-2: ami-010f98c99b632ece4 +ap-northeast-3: ami-09dd084db04a10d89 +ap-south-1: ami-0f4b9b7eb6f7c8bf4 +ap-southeast-1: ami-0d756e8aa7628a1d9 +ap-southeast-2: ami-00e287b293a3b7978 +ca-central-1: ami-0f34197ba57c32264 +eu-central-1: ami-0560468e7585063ec +eu-north-1: ami-0b8ee035751765dd5 +eu-west-1: ami-07fb0fe1262b74210 +eu-west-2: ami-00bae65e0cb9fbd31 +eu-west-3: ami-0fa09ce7bae809182 +sa-east-1: ami-014a018823095eb14 +us-east-1: ami-0b0130c133fae6607 +us-east-2: ami-0f4af6aff781edc2e +us-west-1: ami-01fef859674ae14f1 +us-west-2: ami-0c0a6c53fc044e440 # centos7 -ap-northeast-1: ami-09b175846250d37e0 -ap-northeast-2: ami-05a371ea785419539 -ap-northeast-3: ami-056bfe9c042992212 -ap-south-1: ami-0cf72d39ece5fe7b8 -ap-southeast-1: ami-08e0a6e0d5e22df29 -ap-southeast-2: ami-04b6949a410a967ed -ca-central-1: ami-01cd6f223f4ca39af -eu-central-1: ami-00b98e76cefa6f7b8 -eu-north-1: ami-03bba2a04c7eee896 -eu-west-1: ami-0d7719633ec6daed8 -eu-west-2: ami-05bcb384a4c8b0c5b -eu-west-3: ami-054d53deecd76ad61 -sa-east-1: ami-089367ecc959c8bc4 -us-east-1: ami-0f942ac10a338af45 
-us-east-2: ami-0edbaef69d274e64e -us-west-1: ami-036c33c35fa178b86 -us-west-2: ami-0bbd399489108361e +ap-northeast-1: ami-0784def0ccb2d5b82 +ap-northeast-2: ami-0d99fb81635d7615c +ap-northeast-3: ami-0c95572f59739c0bf +ap-south-1: ami-010b8728beb0383d6 +ap-southeast-1: ami-06ddf4654231a4d4c +ap-southeast-2: ami-0d1ab341d60525d01 +ca-central-1: ami-092fc6e2504e5f9a2 +eu-central-1: ami-003a4b7027af55f24 +eu-north-1: ami-071f5c908f62cc6cd +eu-west-1: ami-0bbd6b980d3313c64 +eu-west-2: ami-0907c15064d4edba5 +eu-west-3: ami-0125d2751f93718eb +sa-east-1: ami-07454a63f6b9e04de +us-east-1: ami-01f0260a02285b5dc +us-east-2: ami-0f9dab1d43d5744ed +us-west-1: ami-074340af697a98c37 +us-west-2: ami-0811b1738ec473064 # ubuntu1404 -ap-northeast-1: ami-0348c76e890f528a1 -ap-northeast-2: ami-0c4b80f78e7f62047 -ap-northeast-3: ami-03aaa1e1ae3007767 -ap-south-1: ami-04317bb153d6a8fff -ap-southeast-1: ami-000ce989867413e4e -ap-southeast-2: ami-09a0b2d2bdc669a6c -ca-central-1: ami-0555c4a7c47384856 -cn-north-1: ami-0839700f2d6edb4e9 -eu-central-1: ami-0d35dcfebb277a64d -eu-north-1: ami-037dc7fd57f8c3151 -eu-west-1: ami-0c1b4a36f6da56db9 -eu-west-2: ami-08b285fd75a7a483b -eu-west-3: ami-0653aa7d98f3033ec -sa-east-1: ami-0024bc42c6c9cc832 -us-east-1: ami-063cb866ee81436bf -us-east-2: ami-0d7393fe329fac685 -us-gov-east-1: ami-06bcf13af6f8352b2 -us-gov-west-1: ami-a89deec9 -us-west-1: ami-0e9ab67cbeffe41b6 -us-west-2: ami-056be711a3a572d20 +ap-northeast-1: ami-0cd539d9e885af044 +ap-northeast-2: ami-0924984889a5d4a46 +ap-northeast-3: ami-07450b1aef436d233 +ap-south-1: ami-05f60d15408d42ac9 +ap-southeast-1: ami-0b1a46b1b9d2f4fb9 +ap-southeast-2: ami-0863af7ce35b4323f +ca-central-1: ami-09bbf1a83ca0f5ff0 +cn-north-1: ami-05f293ab117a7842d +eu-central-1: ami-0abac01d1d843346c +eu-north-1: ami-0bf9760df9cfca009 +eu-west-1: ami-091779c5c891c35c2 +eu-west-2: ami-07da7ca913a6ea97f +eu-west-3: ami-0427e1cae6b8e9f89 +sa-east-1: ami-0a228fdac1d498e8c +us-east-1: ami-05dbcb5308069c285 +us-east-2: ami-0a8e709dece19523d +us-gov-east-1: ami-0441f56840dcb67ca +us-gov-west-1: ami-a6d3a2c7 +us-west-1: ami-0e01f060099ea747b +us-west-2: ami-08d70cb46b697c172 # ubuntu1604 -ap-northeast-1: ami-0446db0488549a643 -ap-northeast-2: ami-0b0bdd98877e4e506 -ap-northeast-3: ami-088f0307cdd06a985 -ap-south-1: ami-066e4cfcc460304e0 -ap-southeast-1: ami-0597821ac99a481e3 -ap-southeast-2: ami-03c9d6bdf2a2cba07 -ca-central-1: ami-0e4208d9ab789da40 -cn-north-1: ami-0e77f8ab0c91d2b9c -eu-central-1: ami-0457c581edab1213c -eu-north-1: ami-03f64544dad6ae6be -eu-west-1: ami-0aee65b23ac6121e5 -eu-west-2: ami-001486f35ffff87d9 -eu-west-3: ami-05641d46059f19f8f -sa-east-1: ami-00451f8f5429960a6 -us-east-1: ami-05736751ef500681f -us-east-2: ami-05f87726bd2802a79 -us-gov-east-1: ami-033192481089d3c64 -us-gov-west-1: ami-f3ec9f92 -us-west-1: ami-03ff95bfd61b9cb5e -us-west-2: ami-0d6d6a4cbeb63a57d +ap-northeast-1: ami-04b272dcdd06e1564 +ap-northeast-2: ami-0b3c32fb3a4e9c0ef +ap-northeast-3: ami-01f5ef9fbe48d7bb2 +ap-south-1: ami-0076ff722a254fdca +ap-southeast-1: ami-0a4ea1282cc83d41c +ap-southeast-2: ami-0fe13fe2f6911fa5d +ca-central-1: ami-0a7b7779aba55c024 +cn-north-1: ami-05d35ef899afade2f +eu-central-1: ami-0ae0084f39e8dce54 +eu-north-1: ami-0383b711246b9c840 +eu-west-1: ami-091dd999747fc79d7 +eu-west-2: ami-0cd1373102c0a4d5d +eu-west-3: ami-0f3bba95cd9c832d1 +sa-east-1: ami-00416702e6c84f6c7 +us-east-1: ami-00de8aa07f24052f6 +us-east-2: ami-0d7f19ca4f88c3044 +us-gov-east-1: ami-035e990927c75e7d8 +us-gov-west-1: ami-e1d4a580 +us-west-1: 
ami-0f02ce7616b3016af +us-west-2: ami-0b1d55af6440b3b96 diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index c99096b19f..c4a7e1b362 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1200,141 +1200,141 @@ "Mappings": { "AWSRegionOS2AMI": { "ap-northeast-1": { - "alinux": "ami-072ac3e57034c3bb9", - "centos6": "ami-0f5ea785473ed7ab5", - "centos7": "ami-09b175846250d37e0", - "ubuntu1404": "ami-0348c76e890f528a1", - "ubuntu1604": "ami-0446db0488549a643" + "alinux": "ami-08bbc6b440ae39278", + "centos6": "ami-0b27f204a8adab23b", + "centos7": "ami-0784def0ccb2d5b82", + "ubuntu1404": "ami-0cd539d9e885af044", + "ubuntu1604": "ami-04b272dcdd06e1564" }, "ap-northeast-2": { - "alinux": "ami-080aecd84cc7434f0", - "centos6": "ami-07c9635710bc567e6", - "centos7": "ami-05a371ea785419539", - "ubuntu1404": "ami-0c4b80f78e7f62047", - "ubuntu1604": "ami-0b0bdd98877e4e506" + "alinux": "ami-0b94cc099eb2afed1", + "centos6": "ami-010f98c99b632ece4", + "centos7": "ami-0d99fb81635d7615c", + "ubuntu1404": "ami-0924984889a5d4a46", + "ubuntu1604": "ami-0b3c32fb3a4e9c0ef" }, "ap-northeast-3": { - "alinux": "ami-0b0ebcca7a6d3cc89", - "centos6": "ami-0b65ed5f7a0ff726a", - "centos7": "ami-056bfe9c042992212", - "ubuntu1404": "ami-03aaa1e1ae3007767", - "ubuntu1604": "ami-088f0307cdd06a985" + "alinux": "ami-08f450d90bf811e75", + "centos6": "ami-09dd084db04a10d89", + "centos7": "ami-0c95572f59739c0bf", + "ubuntu1404": "ami-07450b1aef436d233", + "ubuntu1604": "ami-01f5ef9fbe48d7bb2" }, "ap-south-1": { - "alinux": "ami-0357da922b040bdb4", - "centos6": "ami-0e18648a2d5b5e83c", - "centos7": "ami-0cf72d39ece5fe7b8", - "ubuntu1404": "ami-04317bb153d6a8fff", - "ubuntu1604": "ami-066e4cfcc460304e0" + "alinux": "ami-0991980c18e306e8e", + "centos6": "ami-0f4b9b7eb6f7c8bf4", + "centos7": "ami-010b8728beb0383d6", + "ubuntu1404": "ami-05f60d15408d42ac9", + "ubuntu1604": "ami-0076ff722a254fdca" }, "ap-southeast-1": { - "alinux": "ami-0fa2ec4e5f0402d8d", - "centos6": "ami-0f784e8d9290e0f50", - "centos7": "ami-08e0a6e0d5e22df29", - "ubuntu1404": "ami-000ce989867413e4e", - "ubuntu1604": "ami-0597821ac99a481e3" + "alinux": "ami-0b28cded2d076c436", + "centos6": "ami-0d756e8aa7628a1d9", + "centos7": "ami-06ddf4654231a4d4c", + "ubuntu1404": "ami-0b1a46b1b9d2f4fb9", + "ubuntu1604": "ami-0a4ea1282cc83d41c" }, "ap-southeast-2": { - "alinux": "ami-00bcdbe79f9f9360f", - "centos6": "ami-0e5559991f7845038", - "centos7": "ami-04b6949a410a967ed", - "ubuntu1404": "ami-09a0b2d2bdc669a6c", - "ubuntu1604": "ami-03c9d6bdf2a2cba07" + "alinux": "ami-0f6bd7f7e505539e1", + "centos6": "ami-00e287b293a3b7978", + "centos7": "ami-0d1ab341d60525d01", + "ubuntu1404": "ami-0863af7ce35b4323f", + "ubuntu1604": "ami-0fe13fe2f6911fa5d" }, "ca-central-1": { - "alinux": "ami-06f86504395727a59", - "centos6": "ami-0ba632dba5f588a43", - "centos7": "ami-01cd6f223f4ca39af", - "ubuntu1404": "ami-0555c4a7c47384856", - "ubuntu1604": "ami-0e4208d9ab789da40" + "alinux": "ami-0b680d93b3d56c72a", + "centos6": "ami-0f34197ba57c32264", + "centos7": "ami-092fc6e2504e5f9a2", + "ubuntu1404": "ami-09bbf1a83ca0f5ff0", + "ubuntu1604": "ami-0a7b7779aba55c024" }, "cn-north-1": { - "alinux": "ami-0c7128d1d555a95f3", - "ubuntu1404": "ami-0839700f2d6edb4e9", - "ubuntu1604": "ami-0e77f8ab0c91d2b9c" + "alinux": "ami-0decad57dff4bb3ea", + "ubuntu1404": "ami-05f293ab117a7842d", + "ubuntu1604": "ami-05d35ef899afade2f" }, "cn-northwest-1": { - "alinux": "ami-0b838ce1114165ca1" + "alinux": 
"ami-090c0f959ead7a813" }, "eu-central-1": { - "alinux": "ami-00917e29a34aa32ee", - "centos6": "ami-04771fcea0287def6", - "centos7": "ami-00b98e76cefa6f7b8", - "ubuntu1404": "ami-0d35dcfebb277a64d", - "ubuntu1604": "ami-0457c581edab1213c" + "alinux": "ami-0f188f96685b31355", + "centos6": "ami-0560468e7585063ec", + "centos7": "ami-003a4b7027af55f24", + "ubuntu1404": "ami-0abac01d1d843346c", + "ubuntu1604": "ami-0ae0084f39e8dce54" }, "eu-north-1": { - "alinux": "ami-045b3be6347ece0fb", - "centos6": "ami-0adf183b1beb95c23", - "centos7": "ami-03bba2a04c7eee896", - "ubuntu1404": "ami-037dc7fd57f8c3151", - "ubuntu1604": "ami-03f64544dad6ae6be" + "alinux": "ami-08053d8c06274f531", + "centos6": "ami-0b8ee035751765dd5", + "centos7": "ami-071f5c908f62cc6cd", + "ubuntu1404": "ami-0bf9760df9cfca009", + "ubuntu1604": "ami-0383b711246b9c840" }, "eu-west-1": { - "alinux": "ami-0b05e2f00ebee148c", - "centos6": "ami-0c965c8bf7c698110", - "centos7": "ami-0d7719633ec6daed8", - "ubuntu1404": "ami-0c1b4a36f6da56db9", - "ubuntu1604": "ami-0aee65b23ac6121e5" + "alinux": "ami-0fd242bc2abb1b146", + "centos6": "ami-07fb0fe1262b74210", + "centos7": "ami-0bbd6b980d3313c64", + "ubuntu1404": "ami-091779c5c891c35c2", + "ubuntu1604": "ami-091dd999747fc79d7" }, "eu-west-2": { - "alinux": "ami-0722446a78ea175f8", - "centos6": "ami-032a7702e546c7e42", - "centos7": "ami-05bcb384a4c8b0c5b", - "ubuntu1404": "ami-08b285fd75a7a483b", - "ubuntu1604": "ami-001486f35ffff87d9" + "alinux": "ami-046b84d0cba5d292f", + "centos6": "ami-00bae65e0cb9fbd31", + "centos7": "ami-0907c15064d4edba5", + "ubuntu1404": "ami-07da7ca913a6ea97f", + "ubuntu1604": "ami-0cd1373102c0a4d5d" }, "eu-west-3": { - "alinux": "ami-0c54390b66173bd7d", - "centos6": "ami-0b7b602a10f83961c", - "centos7": "ami-054d53deecd76ad61", - "ubuntu1404": "ami-0653aa7d98f3033ec", - "ubuntu1604": "ami-05641d46059f19f8f" + "alinux": "ami-0046b758814704b9f", + "centos6": "ami-0fa09ce7bae809182", + "centos7": "ami-0125d2751f93718eb", + "ubuntu1404": "ami-0427e1cae6b8e9f89", + "ubuntu1604": "ami-0f3bba95cd9c832d1" }, "sa-east-1": { - "alinux": "ami-0a47ff8d07b75a4e9", - "centos6": "ami-03309e4b0f7ef6ea5", - "centos7": "ami-089367ecc959c8bc4", - "ubuntu1404": "ami-0024bc42c6c9cc832", - "ubuntu1604": "ami-00451f8f5429960a6" + "alinux": "ami-06cc41ea6409482e5", + "centos6": "ami-014a018823095eb14", + "centos7": "ami-07454a63f6b9e04de", + "ubuntu1404": "ami-0a228fdac1d498e8c", + "ubuntu1604": "ami-00416702e6c84f6c7" }, "us-east-1": { - "alinux": "ami-0d8d467ddca09f9ea", - "centos6": "ami-0e2b588a37264f1cb", - "centos7": "ami-0f942ac10a338af45", - "ubuntu1404": "ami-063cb866ee81436bf", - "ubuntu1604": "ami-05736751ef500681f" + "alinux": "ami-02787c6b3550be361", + "centos6": "ami-0b0130c133fae6607", + "centos7": "ami-01f0260a02285b5dc", + "ubuntu1404": "ami-05dbcb5308069c285", + "ubuntu1604": "ami-00de8aa07f24052f6" }, "us-east-2": { - "alinux": "ami-0c8fd29db55b1fd99", - "centos6": "ami-09461a00d94eb74e5", - "centos7": "ami-0edbaef69d274e64e", - "ubuntu1404": "ami-0d7393fe329fac685", - "ubuntu1604": "ami-05f87726bd2802a79" + "alinux": "ami-0c4b18bab302f095b", + "centos6": "ami-0f4af6aff781edc2e", + "centos7": "ami-0f9dab1d43d5744ed", + "ubuntu1404": "ami-0a8e709dece19523d", + "ubuntu1604": "ami-0d7f19ca4f88c3044" }, "us-gov-east-1": { - "alinux": "ami-0646415808673ae07", - "ubuntu1404": "ami-06bcf13af6f8352b2", - "ubuntu1604": "ami-033192481089d3c64" + "alinux": "ami-063828ccbf0ad3cfe", + "ubuntu1404": "ami-0441f56840dcb67ca", + "ubuntu1604": "ami-035e990927c75e7d8" }, "us-gov-west-1": 
{ - "alinux": "ami-3b83f05a", - "ubuntu1404": "ami-a89deec9", - "ubuntu1604": "ami-f3ec9f92" + "alinux": "ami-9bd7a6fa", + "ubuntu1404": "ami-a6d3a2c7", + "ubuntu1604": "ami-e1d4a580" }, "us-west-1": { - "alinux": "ami-06258a4b05a6eb611", - "centos6": "ami-04d12cccd51b29965", - "centos7": "ami-036c33c35fa178b86", - "ubuntu1404": "ami-0e9ab67cbeffe41b6", - "ubuntu1604": "ami-03ff95bfd61b9cb5e" + "alinux": "ami-027a3a20855e7be03", + "centos6": "ami-01fef859674ae14f1", + "centos7": "ami-074340af697a98c37", + "ubuntu1404": "ami-0e01f060099ea747b", + "ubuntu1604": "ami-0f02ce7616b3016af" }, "us-west-2": { - "alinux": "ami-020c1ec32ad429b44", - "centos6": "ami-0e2efd8528c9ddb07", - "centos7": "ami-0bbd399489108361e", - "ubuntu1404": "ami-056be711a3a572d20", - "ubuntu1604": "ami-0d6d6a4cbeb63a57d" + "alinux": "ami-00b4f23ba0a57a141", + "centos6": "ami-0c0a6c53fc044e440", + "centos7": "ami-0811b1738ec473064", + "ubuntu1404": "ami-08d70cb46b697c172", + "ubuntu1604": "ami-0b1d55af6440b3b96" } }, "OSFeatures": { From 9fc747627012e30df534226335ee489da1a2ef80 Mon Sep 17 00:00:00 2001 From: ParallelCluster AMI bot Date: Mon, 29 Apr 2019 14:31:16 +0000 Subject: [PATCH 052/121] Update AMI List Build Number 96 aws-parallelcluster-cookbook Git hash: c0556ecc29a40d20225fe1d6bc40417c8759702d aws-parallelcluster-node Git hash: cd8e84016a974802267417dd5bd795922ca85fef Signed-off-by: ParallelCluster AMI bot --- amis.txt | 190 ++++++++++---------- cloudformation/aws-parallelcluster.cfn.json | 190 ++++++++++---------- 2 files changed, 190 insertions(+), 190 deletions(-) diff --git a/amis.txt b/amis.txt index ba448a7542..d9ebd68032 100644 --- a/amis.txt +++ b/amis.txt @@ -1,100 +1,100 @@ # alinux -ap-northeast-1: ami-08bbc6b440ae39278 -ap-northeast-2: ami-0b94cc099eb2afed1 -ap-northeast-3: ami-08f450d90bf811e75 -ap-south-1: ami-0991980c18e306e8e -ap-southeast-1: ami-0b28cded2d076c436 -ap-southeast-2: ami-0f6bd7f7e505539e1 -ca-central-1: ami-0b680d93b3d56c72a -cn-north-1: ami-0decad57dff4bb3ea -cn-northwest-1: ami-090c0f959ead7a813 -eu-central-1: ami-0f188f96685b31355 -eu-north-1: ami-08053d8c06274f531 -eu-west-1: ami-0fd242bc2abb1b146 -eu-west-2: ami-046b84d0cba5d292f -eu-west-3: ami-0046b758814704b9f -sa-east-1: ami-06cc41ea6409482e5 -us-east-1: ami-02787c6b3550be361 -us-east-2: ami-0c4b18bab302f095b -us-gov-east-1: ami-063828ccbf0ad3cfe -us-gov-west-1: ami-9bd7a6fa -us-west-1: ami-027a3a20855e7be03 -us-west-2: ami-00b4f23ba0a57a141 +ap-northeast-1: ami-072ac3e57034c3bb9 +ap-northeast-2: ami-080aecd84cc7434f0 +ap-northeast-3: ami-0b0ebcca7a6d3cc89 +ap-south-1: ami-0357da922b040bdb4 +ap-southeast-1: ami-0fa2ec4e5f0402d8d +ap-southeast-2: ami-00bcdbe79f9f9360f +ca-central-1: ami-06f86504395727a59 +cn-north-1: ami-0c7128d1d555a95f3 +cn-northwest-1: ami-0b838ce1114165ca1 +eu-central-1: ami-00917e29a34aa32ee +eu-north-1: ami-045b3be6347ece0fb +eu-west-1: ami-0b05e2f00ebee148c +eu-west-2: ami-0722446a78ea175f8 +eu-west-3: ami-0c54390b66173bd7d +sa-east-1: ami-0a47ff8d07b75a4e9 +us-east-1: ami-0d8d467ddca09f9ea +us-east-2: ami-0c8fd29db55b1fd99 +us-gov-east-1: ami-0646415808673ae07 +us-gov-west-1: ami-3b83f05a +us-west-1: ami-06258a4b05a6eb611 +us-west-2: ami-020c1ec32ad429b44 # centos6 -ap-northeast-1: ami-0b27f204a8adab23b -ap-northeast-2: ami-010f98c99b632ece4 -ap-northeast-3: ami-09dd084db04a10d89 -ap-south-1: ami-0f4b9b7eb6f7c8bf4 -ap-southeast-1: ami-0d756e8aa7628a1d9 -ap-southeast-2: ami-00e287b293a3b7978 -ca-central-1: ami-0f34197ba57c32264 -eu-central-1: ami-0560468e7585063ec -eu-north-1: ami-0b8ee035751765dd5 
-eu-west-1: ami-07fb0fe1262b74210 -eu-west-2: ami-00bae65e0cb9fbd31 -eu-west-3: ami-0fa09ce7bae809182 -sa-east-1: ami-014a018823095eb14 -us-east-1: ami-0b0130c133fae6607 -us-east-2: ami-0f4af6aff781edc2e -us-west-1: ami-01fef859674ae14f1 -us-west-2: ami-0c0a6c53fc044e440 +ap-northeast-1: ami-0f5ea785473ed7ab5 +ap-northeast-2: ami-07c9635710bc567e6 +ap-northeast-3: ami-0b65ed5f7a0ff726a +ap-south-1: ami-0e18648a2d5b5e83c +ap-southeast-1: ami-0f784e8d9290e0f50 +ap-southeast-2: ami-0e5559991f7845038 +ca-central-1: ami-0ba632dba5f588a43 +eu-central-1: ami-04771fcea0287def6 +eu-north-1: ami-0adf183b1beb95c23 +eu-west-1: ami-0c965c8bf7c698110 +eu-west-2: ami-032a7702e546c7e42 +eu-west-3: ami-0b7b602a10f83961c +sa-east-1: ami-03309e4b0f7ef6ea5 +us-east-1: ami-0e2b588a37264f1cb +us-east-2: ami-09461a00d94eb74e5 +us-west-1: ami-04d12cccd51b29965 +us-west-2: ami-0e2efd8528c9ddb07 # centos7 -ap-northeast-1: ami-0784def0ccb2d5b82 -ap-northeast-2: ami-0d99fb81635d7615c -ap-northeast-3: ami-0c95572f59739c0bf -ap-south-1: ami-010b8728beb0383d6 -ap-southeast-1: ami-06ddf4654231a4d4c -ap-southeast-2: ami-0d1ab341d60525d01 -ca-central-1: ami-092fc6e2504e5f9a2 -eu-central-1: ami-003a4b7027af55f24 -eu-north-1: ami-071f5c908f62cc6cd -eu-west-1: ami-0bbd6b980d3313c64 -eu-west-2: ami-0907c15064d4edba5 -eu-west-3: ami-0125d2751f93718eb -sa-east-1: ami-07454a63f6b9e04de -us-east-1: ami-01f0260a02285b5dc -us-east-2: ami-0f9dab1d43d5744ed -us-west-1: ami-074340af697a98c37 -us-west-2: ami-0811b1738ec473064 +ap-northeast-1: ami-09b175846250d37e0 +ap-northeast-2: ami-05a371ea785419539 +ap-northeast-3: ami-056bfe9c042992212 +ap-south-1: ami-0cf72d39ece5fe7b8 +ap-southeast-1: ami-08e0a6e0d5e22df29 +ap-southeast-2: ami-04b6949a410a967ed +ca-central-1: ami-01cd6f223f4ca39af +eu-central-1: ami-00b98e76cefa6f7b8 +eu-north-1: ami-03bba2a04c7eee896 +eu-west-1: ami-0d7719633ec6daed8 +eu-west-2: ami-05bcb384a4c8b0c5b +eu-west-3: ami-054d53deecd76ad61 +sa-east-1: ami-089367ecc959c8bc4 +us-east-1: ami-0f942ac10a338af45 +us-east-2: ami-0edbaef69d274e64e +us-west-1: ami-036c33c35fa178b86 +us-west-2: ami-0bbd399489108361e # ubuntu1404 -ap-northeast-1: ami-0cd539d9e885af044 -ap-northeast-2: ami-0924984889a5d4a46 -ap-northeast-3: ami-07450b1aef436d233 -ap-south-1: ami-05f60d15408d42ac9 -ap-southeast-1: ami-0b1a46b1b9d2f4fb9 -ap-southeast-2: ami-0863af7ce35b4323f -ca-central-1: ami-09bbf1a83ca0f5ff0 -cn-north-1: ami-05f293ab117a7842d -eu-central-1: ami-0abac01d1d843346c -eu-north-1: ami-0bf9760df9cfca009 -eu-west-1: ami-091779c5c891c35c2 -eu-west-2: ami-07da7ca913a6ea97f -eu-west-3: ami-0427e1cae6b8e9f89 -sa-east-1: ami-0a228fdac1d498e8c -us-east-1: ami-05dbcb5308069c285 -us-east-2: ami-0a8e709dece19523d -us-gov-east-1: ami-0441f56840dcb67ca -us-gov-west-1: ami-a6d3a2c7 -us-west-1: ami-0e01f060099ea747b -us-west-2: ami-08d70cb46b697c172 +ap-northeast-1: ami-0348c76e890f528a1 +ap-northeast-2: ami-0c4b80f78e7f62047 +ap-northeast-3: ami-03aaa1e1ae3007767 +ap-south-1: ami-04317bb153d6a8fff +ap-southeast-1: ami-000ce989867413e4e +ap-southeast-2: ami-09a0b2d2bdc669a6c +ca-central-1: ami-0555c4a7c47384856 +cn-north-1: ami-0839700f2d6edb4e9 +eu-central-1: ami-0d35dcfebb277a64d +eu-north-1: ami-037dc7fd57f8c3151 +eu-west-1: ami-0c1b4a36f6da56db9 +eu-west-2: ami-08b285fd75a7a483b +eu-west-3: ami-0653aa7d98f3033ec +sa-east-1: ami-0024bc42c6c9cc832 +us-east-1: ami-00144db1528d8942d +us-east-2: ami-0d7393fe329fac685 +us-gov-east-1: ami-06bcf13af6f8352b2 +us-gov-west-1: ami-a89deec9 +us-west-1: ami-0e9ab67cbeffe41b6 +us-west-2: ami-056be711a3a572d20 
# ubuntu1604 -ap-northeast-1: ami-04b272dcdd06e1564 -ap-northeast-2: ami-0b3c32fb3a4e9c0ef -ap-northeast-3: ami-01f5ef9fbe48d7bb2 -ap-south-1: ami-0076ff722a254fdca -ap-southeast-1: ami-0a4ea1282cc83d41c -ap-southeast-2: ami-0fe13fe2f6911fa5d -ca-central-1: ami-0a7b7779aba55c024 -cn-north-1: ami-05d35ef899afade2f -eu-central-1: ami-0ae0084f39e8dce54 -eu-north-1: ami-0383b711246b9c840 -eu-west-1: ami-091dd999747fc79d7 -eu-west-2: ami-0cd1373102c0a4d5d -eu-west-3: ami-0f3bba95cd9c832d1 -sa-east-1: ami-00416702e6c84f6c7 -us-east-1: ami-00de8aa07f24052f6 -us-east-2: ami-0d7f19ca4f88c3044 -us-gov-east-1: ami-035e990927c75e7d8 -us-gov-west-1: ami-e1d4a580 -us-west-1: ami-0f02ce7616b3016af -us-west-2: ami-0b1d55af6440b3b96 +ap-northeast-1: ami-0446db0488549a643 +ap-northeast-2: ami-0b0bdd98877e4e506 +ap-northeast-3: ami-088f0307cdd06a985 +ap-south-1: ami-066e4cfcc460304e0 +ap-southeast-1: ami-0597821ac99a481e3 +ap-southeast-2: ami-03c9d6bdf2a2cba07 +ca-central-1: ami-0e4208d9ab789da40 +cn-north-1: ami-0e77f8ab0c91d2b9c +eu-central-1: ami-0457c581edab1213c +eu-north-1: ami-03f64544dad6ae6be +eu-west-1: ami-0aee65b23ac6121e5 +eu-west-2: ami-001486f35ffff87d9 +eu-west-3: ami-05641d46059f19f8f +sa-east-1: ami-00451f8f5429960a6 +us-east-1: ami-02b5b8dab90e057c4 +us-east-2: ami-05f87726bd2802a79 +us-gov-east-1: ami-033192481089d3c64 +us-gov-west-1: ami-f3ec9f92 +us-west-1: ami-03ff95bfd61b9cb5e +us-west-2: ami-0d6d6a4cbeb63a57d diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index c4a7e1b362..68eb976260 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1200,141 +1200,141 @@ "Mappings": { "AWSRegionOS2AMI": { "ap-northeast-1": { - "alinux": "ami-08bbc6b440ae39278", - "centos6": "ami-0b27f204a8adab23b", - "centos7": "ami-0784def0ccb2d5b82", - "ubuntu1404": "ami-0cd539d9e885af044", - "ubuntu1604": "ami-04b272dcdd06e1564" + "alinux": "ami-072ac3e57034c3bb9", + "centos6": "ami-0f5ea785473ed7ab5", + "centos7": "ami-09b175846250d37e0", + "ubuntu1404": "ami-0348c76e890f528a1", + "ubuntu1604": "ami-0446db0488549a643" }, "ap-northeast-2": { - "alinux": "ami-0b94cc099eb2afed1", - "centos6": "ami-010f98c99b632ece4", - "centos7": "ami-0d99fb81635d7615c", - "ubuntu1404": "ami-0924984889a5d4a46", - "ubuntu1604": "ami-0b3c32fb3a4e9c0ef" + "alinux": "ami-080aecd84cc7434f0", + "centos6": "ami-07c9635710bc567e6", + "centos7": "ami-05a371ea785419539", + "ubuntu1404": "ami-0c4b80f78e7f62047", + "ubuntu1604": "ami-0b0bdd98877e4e506" }, "ap-northeast-3": { - "alinux": "ami-08f450d90bf811e75", - "centos6": "ami-09dd084db04a10d89", - "centos7": "ami-0c95572f59739c0bf", - "ubuntu1404": "ami-07450b1aef436d233", - "ubuntu1604": "ami-01f5ef9fbe48d7bb2" + "alinux": "ami-0b0ebcca7a6d3cc89", + "centos6": "ami-0b65ed5f7a0ff726a", + "centos7": "ami-056bfe9c042992212", + "ubuntu1404": "ami-03aaa1e1ae3007767", + "ubuntu1604": "ami-088f0307cdd06a985" }, "ap-south-1": { - "alinux": "ami-0991980c18e306e8e", - "centos6": "ami-0f4b9b7eb6f7c8bf4", - "centos7": "ami-010b8728beb0383d6", - "ubuntu1404": "ami-05f60d15408d42ac9", - "ubuntu1604": "ami-0076ff722a254fdca" + "alinux": "ami-0357da922b040bdb4", + "centos6": "ami-0e18648a2d5b5e83c", + "centos7": "ami-0cf72d39ece5fe7b8", + "ubuntu1404": "ami-04317bb153d6a8fff", + "ubuntu1604": "ami-066e4cfcc460304e0" }, "ap-southeast-1": { - "alinux": "ami-0b28cded2d076c436", - "centos6": "ami-0d756e8aa7628a1d9", - "centos7": "ami-06ddf4654231a4d4c", - "ubuntu1404": "ami-0b1a46b1b9d2f4fb9", - 
"ubuntu1604": "ami-0a4ea1282cc83d41c" + "alinux": "ami-0fa2ec4e5f0402d8d", + "centos6": "ami-0f784e8d9290e0f50", + "centos7": "ami-08e0a6e0d5e22df29", + "ubuntu1404": "ami-000ce989867413e4e", + "ubuntu1604": "ami-0597821ac99a481e3" }, "ap-southeast-2": { - "alinux": "ami-0f6bd7f7e505539e1", - "centos6": "ami-00e287b293a3b7978", - "centos7": "ami-0d1ab341d60525d01", - "ubuntu1404": "ami-0863af7ce35b4323f", - "ubuntu1604": "ami-0fe13fe2f6911fa5d" + "alinux": "ami-00bcdbe79f9f9360f", + "centos6": "ami-0e5559991f7845038", + "centos7": "ami-04b6949a410a967ed", + "ubuntu1404": "ami-09a0b2d2bdc669a6c", + "ubuntu1604": "ami-03c9d6bdf2a2cba07" }, "ca-central-1": { - "alinux": "ami-0b680d93b3d56c72a", - "centos6": "ami-0f34197ba57c32264", - "centos7": "ami-092fc6e2504e5f9a2", - "ubuntu1404": "ami-09bbf1a83ca0f5ff0", - "ubuntu1604": "ami-0a7b7779aba55c024" + "alinux": "ami-06f86504395727a59", + "centos6": "ami-0ba632dba5f588a43", + "centos7": "ami-01cd6f223f4ca39af", + "ubuntu1404": "ami-0555c4a7c47384856", + "ubuntu1604": "ami-0e4208d9ab789da40" }, "cn-north-1": { - "alinux": "ami-0decad57dff4bb3ea", - "ubuntu1404": "ami-05f293ab117a7842d", - "ubuntu1604": "ami-05d35ef899afade2f" + "alinux": "ami-0c7128d1d555a95f3", + "ubuntu1404": "ami-0839700f2d6edb4e9", + "ubuntu1604": "ami-0e77f8ab0c91d2b9c" }, "cn-northwest-1": { - "alinux": "ami-090c0f959ead7a813" + "alinux": "ami-0b838ce1114165ca1" }, "eu-central-1": { - "alinux": "ami-0f188f96685b31355", - "centos6": "ami-0560468e7585063ec", - "centos7": "ami-003a4b7027af55f24", - "ubuntu1404": "ami-0abac01d1d843346c", - "ubuntu1604": "ami-0ae0084f39e8dce54" + "alinux": "ami-00917e29a34aa32ee", + "centos6": "ami-04771fcea0287def6", + "centos7": "ami-00b98e76cefa6f7b8", + "ubuntu1404": "ami-0d35dcfebb277a64d", + "ubuntu1604": "ami-0457c581edab1213c" }, "eu-north-1": { - "alinux": "ami-08053d8c06274f531", - "centos6": "ami-0b8ee035751765dd5", - "centos7": "ami-071f5c908f62cc6cd", - "ubuntu1404": "ami-0bf9760df9cfca009", - "ubuntu1604": "ami-0383b711246b9c840" + "alinux": "ami-045b3be6347ece0fb", + "centos6": "ami-0adf183b1beb95c23", + "centos7": "ami-03bba2a04c7eee896", + "ubuntu1404": "ami-037dc7fd57f8c3151", + "ubuntu1604": "ami-03f64544dad6ae6be" }, "eu-west-1": { - "alinux": "ami-0fd242bc2abb1b146", - "centos6": "ami-07fb0fe1262b74210", - "centos7": "ami-0bbd6b980d3313c64", - "ubuntu1404": "ami-091779c5c891c35c2", - "ubuntu1604": "ami-091dd999747fc79d7" + "alinux": "ami-0b05e2f00ebee148c", + "centos6": "ami-0c965c8bf7c698110", + "centos7": "ami-0d7719633ec6daed8", + "ubuntu1404": "ami-0c1b4a36f6da56db9", + "ubuntu1604": "ami-0aee65b23ac6121e5" }, "eu-west-2": { - "alinux": "ami-046b84d0cba5d292f", - "centos6": "ami-00bae65e0cb9fbd31", - "centos7": "ami-0907c15064d4edba5", - "ubuntu1404": "ami-07da7ca913a6ea97f", - "ubuntu1604": "ami-0cd1373102c0a4d5d" + "alinux": "ami-0722446a78ea175f8", + "centos6": "ami-032a7702e546c7e42", + "centos7": "ami-05bcb384a4c8b0c5b", + "ubuntu1404": "ami-08b285fd75a7a483b", + "ubuntu1604": "ami-001486f35ffff87d9" }, "eu-west-3": { - "alinux": "ami-0046b758814704b9f", - "centos6": "ami-0fa09ce7bae809182", - "centos7": "ami-0125d2751f93718eb", - "ubuntu1404": "ami-0427e1cae6b8e9f89", - "ubuntu1604": "ami-0f3bba95cd9c832d1" + "alinux": "ami-0c54390b66173bd7d", + "centos6": "ami-0b7b602a10f83961c", + "centos7": "ami-054d53deecd76ad61", + "ubuntu1404": "ami-0653aa7d98f3033ec", + "ubuntu1604": "ami-05641d46059f19f8f" }, "sa-east-1": { - "alinux": "ami-06cc41ea6409482e5", - "centos6": "ami-014a018823095eb14", - "centos7": 
"ami-07454a63f6b9e04de", - "ubuntu1404": "ami-0a228fdac1d498e8c", - "ubuntu1604": "ami-00416702e6c84f6c7" + "alinux": "ami-0a47ff8d07b75a4e9", + "centos6": "ami-03309e4b0f7ef6ea5", + "centos7": "ami-089367ecc959c8bc4", + "ubuntu1404": "ami-0024bc42c6c9cc832", + "ubuntu1604": "ami-00451f8f5429960a6" }, "us-east-1": { - "alinux": "ami-02787c6b3550be361", - "centos6": "ami-0b0130c133fae6607", - "centos7": "ami-01f0260a02285b5dc", - "ubuntu1404": "ami-05dbcb5308069c285", - "ubuntu1604": "ami-00de8aa07f24052f6" + "alinux": "ami-0d8d467ddca09f9ea", + "centos6": "ami-0e2b588a37264f1cb", + "centos7": "ami-0f942ac10a338af45", + "ubuntu1404": "ami-00144db1528d8942d", + "ubuntu1604": "ami-02b5b8dab90e057c4" }, "us-east-2": { - "alinux": "ami-0c4b18bab302f095b", - "centos6": "ami-0f4af6aff781edc2e", - "centos7": "ami-0f9dab1d43d5744ed", - "ubuntu1404": "ami-0a8e709dece19523d", - "ubuntu1604": "ami-0d7f19ca4f88c3044" + "alinux": "ami-0c8fd29db55b1fd99", + "centos6": "ami-09461a00d94eb74e5", + "centos7": "ami-0edbaef69d274e64e", + "ubuntu1404": "ami-0d7393fe329fac685", + "ubuntu1604": "ami-05f87726bd2802a79" }, "us-gov-east-1": { - "alinux": "ami-063828ccbf0ad3cfe", - "ubuntu1404": "ami-0441f56840dcb67ca", - "ubuntu1604": "ami-035e990927c75e7d8" + "alinux": "ami-0646415808673ae07", + "ubuntu1404": "ami-06bcf13af6f8352b2", + "ubuntu1604": "ami-033192481089d3c64" }, "us-gov-west-1": { - "alinux": "ami-9bd7a6fa", - "ubuntu1404": "ami-a6d3a2c7", - "ubuntu1604": "ami-e1d4a580" + "alinux": "ami-3b83f05a", + "ubuntu1404": "ami-a89deec9", + "ubuntu1604": "ami-f3ec9f92" }, "us-west-1": { - "alinux": "ami-027a3a20855e7be03", - "centos6": "ami-01fef859674ae14f1", - "centos7": "ami-074340af697a98c37", - "ubuntu1404": "ami-0e01f060099ea747b", - "ubuntu1604": "ami-0f02ce7616b3016af" + "alinux": "ami-06258a4b05a6eb611", + "centos6": "ami-04d12cccd51b29965", + "centos7": "ami-036c33c35fa178b86", + "ubuntu1404": "ami-0e9ab67cbeffe41b6", + "ubuntu1604": "ami-03ff95bfd61b9cb5e" }, "us-west-2": { - "alinux": "ami-00b4f23ba0a57a141", - "centos6": "ami-0c0a6c53fc044e440", - "centos7": "ami-0811b1738ec473064", - "ubuntu1404": "ami-08d70cb46b697c172", - "ubuntu1604": "ami-0b1d55af6440b3b96" + "alinux": "ami-020c1ec32ad429b44", + "centos6": "ami-0e2efd8528c9ddb07", + "centos7": "ami-0bbd399489108361e", + "ubuntu1404": "ami-056be711a3a572d20", + "ubuntu1604": "ami-0d6d6a4cbeb63a57d" } }, "OSFeatures": { From 755d9c8e96e147e94e33a7cc163b96af512772fe Mon Sep 17 00:00:00 2001 From: Ian Colle Date: Mon, 29 Apr 2019 11:30:09 -0700 Subject: [PATCH 053/121] Update CONTRIBUTING.md Fixed typo to resolve LICENSE link. --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6b15a46776..60963ea9c8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -56,6 +56,6 @@ If you discover a potential security issue in this project we ask that you notif ## Licensing -See the [LICENSE](https://github.com/aws/aws-parallelcluster/blob/develop/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. +See the [LICENSE](https://github.com/aws/aws-parallelcluster/blob/develop/LICENSE.txt) file for our project's licensing. We will ask you to confirm the licensing of your contribution. We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 
From 2924fa49112ac4fe29d47b9bb06840c51aaedea6 Mon Sep 17 00:00:00 2001
From: Sean Smith
Date: Tue, 16 Apr 2019 16:06:51 -0700
Subject: [PATCH 054/121] Change sanity_check to not enforce 0.0.0.0/0 CIDR

Previously, sanity_check forced users to open their security group to
0.0.0.0/0 in order to use FSx or EFS. This isn't necessary for either
service, and many users wanted more granular security_group permissions.

This patch checks only the relevant port: `988` for FSx and `2049` for EFS.

[1] https://docs.aws.amazon.com/efs/latest/ug/security-considerations.html#network-access
[2] https://docs.aws.amazon.com/fsx/latest/LustreGuide/limit-access-security-groups.html

Signed-off-by: Sean Smith
---
 cli/pcluster/config_sanity.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/cli/pcluster/config_sanity.py b/cli/pcluster/config_sanity.py
index 4c3eeec998..94827927bd 100644
--- a/cli/pcluster/config_sanity.py
+++ b/cli/pcluster/config_sanity.py
@@ -49,26 +49,25 @@ def __get_partition(self):
     @staticmethod
     def __check_sg_rules_for_port(rule, port_to_check):
         """
-        Verify if the security group rule accepts connections to the given port.
+        Verify if the security group rule accepts connections on the given port.

         :param rule: The rule to check
         :param port_to_check: The port to check
         :return: True if the rule accepts connection, False otherwise
         """
-        port = rule.get("FromPort")
-        ip_rules = rule.get("IpRanges")
-        group = rule.get("UserIdGroupPairs")
+        from_port = rule.get("FromPort")
+        to_port = rule.get("ToPort")
+        ip_protocol = rule.get("IpProtocol")

-        is_valid = False
-        for ip_rule in ip_rules:
-            ip = ip_rule.get("CidrIp")
-            # An existing rule is valid for EFS if, it allows all traffic(0.0.0.0/0)
-            # from all ports or the given port, and does not have a security group restriction
-            if (not port or port == port_to_check) and ip == "0.0.0.0/0" and not group:
-                is_valid = True
-                break
+        # an ip_protocol of -1 means the rule allows all protocols on all ports
+        if ip_protocol == "-1":
+            return True
+        # tcp is protocol number 6; a tcp rule matches when the given port
+        # falls within the rule's [from_port, to_port] range
+        if (ip_protocol in ["tcp", "6"]) and (from_port <= port_to_check <= to_port):
+            return True

-        return is_valid
+        return False

     def __check_efs_fs_id(self, ec2, efs, resource_value):  # noqa: C901 FIXME!!!
         try:
@@ -112,8 +111,8 @@ def __check_efs_fs_id(self, ec2, efs, resource_value):  # noqa: C901 FIXME!!!
                    self.__fail(
                        "EFSFSId",
                        "There is an existing Mount Target %s in the Availability Zone %s for EFS %s, "
-                       "and it does not have a security group with inbound and outbound rules that support NFS. "
-                       "Please modify the Mount Target's security group, or delete the Mount Target."
+                       "but it does not have a security group with inbound and outbound rules that allow NFS traffic. "
+                       "Please modify the Mount Target's security group to allow traffic on port 2049."
                        % (mt_id, availability_zone, resource_value[0]),
                    )
        except ClientError as e:
@@ -171,7 +170,7 @@ def __check_fsx_fs_id(self, ec2, fsx, resource_value):
                "FSXFSId",
                "The current security group settings on file system %s does not satisfy "
                "mounting requirement. The file system must be associated to a security group that allows "
-               "inbound and outbound TCP traffic from 0.0.0.0/0 through port 988." % resource_value[0],
+               "inbound and outbound TCP traffic through port 988." % resource_value[0],
            )
        return True
    except ClientError as e:

From bfcb695c1d4ce921900156294404e4f41f4dc71a Mon Sep 17 00:00:00 2001
From: Francesco De Martino
Date: Fri, 26 Apr 2019 14:30:45 +0200
Subject: [PATCH 055/121] integ tests - slurm: verify cluster doesn't scale up
 when job has invalid cpu requirements

This tests https://github.com/aws/aws-parallelcluster-node/pull/134

Signed-off-by: Francesco De Martino
---
 .../tests/schedulers/test_slurm.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py
index 5c3433241d..d0538e7de9 100644
--- a/tests/integration-tests/tests/schedulers/test_slurm.py
+++ b/tests/integration-tests/tests/schedulers/test_slurm.py
@@ -122,10 +122,19 @@ def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_na
     logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available")
     slurm_commands = SlurmCommands(remote_command_executor)
     result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1)
-    job_id = slurm_commands.assert_job_submitted(result.stdout)
+    max_nodes_job_id = slurm_commands.assert_job_submitted(result.stdout)
+    result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' --cpus-per-task 5")
+    max_cpu_job_id = slurm_commands.assert_job_submitted(result.stdout)
+
     # Wait for reason to be computed
     time.sleep(3)
-    assert_that(_get_job_info(remote_command_executor, job_id)).contains("JobState=PENDING Reason=PartitionNodeLimit")
+    assert_that(_get_job_info(remote_command_executor, max_nodes_job_id)).contains(
+        "JobState=PENDING Reason=PartitionNodeLimit"
+    )
+    assert_that(_get_job_info(remote_command_executor, max_cpu_job_id)).contains(
+        "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED"
+        "_or_reserved_for_jobs_in_higher_priority_partitions"
+    )

     # Check we are not scaling
     time.sleep(60)

From 2ec725926f6dd38df102a34efbf8616a4c294c17 Mon Sep 17 00:00:00 2001
From: Sean Smith
Date: Tue, 30 Apr 2019 10:54:49 -0700
Subject: [PATCH 056/121] Change no-response bot to 7-day close
---
 .github/no-response.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/no-response.yml b/.github/no-response.yml
index f8f7ef35ca..82e40cc8a4 100644
--- a/.github/no-response.yml
+++ b/.github/no-response.yml
@@ -1,7 +1,7 @@
 # Configuration for probot-no-response - https://github.com/probot/no-response

 # Number of days of inactivity before an Issue is closed for lack of response
-daysUntilClose: 14
+daysUntilClose: 7
 # Label requiring a response
 responseRequiredLabel: closing-soon-if-no-response
 # Comment to post when closing an Issue for lack of response. 
Set to `false` to disable From d5dcd7386a6f4d4871782f6348d7461e099d0d23 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 25 Apr 2019 21:12:10 -0600 Subject: [PATCH 057/121] Version ParallelCluster Stacks * Adds a version tag to newly created stacks * Disable using a template **other** than the one used to create the stack in an update * List the version and status during `pcluster list` * Color the output based on stack status Signed-off-by: Sean Smith --- cli/pcluster/cli.py | 9 +++++--- cli/pcluster/pcluster.py | 48 +++++++++++++++++++++++++++++++++++----- docs/configuration.rst | 2 ++ 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/cli/pcluster/cli.py b/cli/pcluster/cli.py index dd0f18aa5f..20438ba04e 100644 --- a/cli/pcluster/cli.py +++ b/cli/pcluster/cli.py @@ -22,6 +22,8 @@ from pcluster import easyconfig, pcluster +LOGGER = logging.getLogger("pcluster.pcluster") + def create(args): pcluster.create(args) @@ -56,7 +58,8 @@ def update(args): def version(args): - pcluster.version(args) + version = pcluster.version() + LOGGER.info(version) def start(args): @@ -162,7 +165,7 @@ def _get_parser(): # update command subparser pupdate = subparsers.add_parser( "update", - help="Updates a running cluster using the values in the config " "file or in a TEMPLATE_URL provided.", + help="Updates a running cluster using the values in the config file.", epilog="When the command is called and it begins polling for the status of that call, " 'it is safe to "Ctrl-C" out. You can always return to that status by ' 'calling "pcluster status mycluster".', @@ -178,7 +181,6 @@ def _get_parser(): default=False, help="Disable CloudFormation stack rollback on error.", ) - pupdate.add_argument("-u", "--template-url", help="Specifies the URL for a custom CloudFormation template.") pupdate.add_argument("-t", "--cluster-template", help="Indicates which cluster template to use.") pupdate.add_argument("-p", "--extra-parameters", help="Adds extra parameters to the stack update.") pupdate.add_argument( @@ -245,6 +247,7 @@ def _get_parser(): help="Displays a list of stacks associated with AWS ParallelCluster.", epilog="This command lists the names of any CloudFormation stacks named parallelcluster-*", ) + plist.add_argument("--color", action="store_true", default=False, help="Display the cluster status in color.") _addarg_config(plist) _addarg_region(plist) plist.set_defaults(func=list_stacks) diff --git a/cli/pcluster/pcluster.py b/cli/pcluster/pcluster.py index 84b684f78d..5063677307 100644 --- a/cli/pcluster/pcluster.py +++ b/cli/pcluster/pcluster.py @@ -34,6 +34,7 @@ import boto3 import pkg_resources from botocore.exceptions import ClientError +from tabulate import tabulate from . import cfnconfig, utils @@ -63,9 +64,9 @@ def create_bucket_with_batch_resources(stack_name, aws_client_config, resources_ return s3_bucket_name -def version(args): +def version(): pcluster_version = pkg_resources.get_distribution("aws-parallelcluster").version - LOGGER.info(pcluster_version) + return pcluster_version def create(args): # noqa: C901 FIXME!!! @@ -116,6 +117,7 @@ def create(args): # noqa: C901 FIXME!!! cfn_params = [{"ParameterKey": key, "ParameterValue": value} for key, value in config.parameters.items()] tags = [{"Key": t, "Value": config.tags[t]} for t in config.tags] + tags.append({"Key": "Version", "Value": version()}) stack = cfn.create_stack( StackName=stack_name, @@ -275,12 +277,12 @@ def update(args): # noqa: C901 FIXME!!! 
config.parameters["AvailabilityZone"] = availability_zone try: - LOGGER.debug((config.template_url, config.parameters)) + LOGGER.debug(config.parameters) cfn_params = [{"ParameterKey": key, "ParameterValue": value} for key, value in config.parameters.items()] LOGGER.info("Calling update_stack") cfn.update_stack( - StackName=stack_name, TemplateURL=config.template_url, Parameters=cfn_params, Capabilities=capabilities + StackName=stack_name, UsePreviousTemplate=True, Parameters=cfn_params, Capabilities=capabilities ) status = cfn.describe_stacks(StackName=stack_name).get("Stacks")[0].get("StackStatus") if not args.nowait: @@ -370,6 +372,33 @@ def stop(args): set_asg_limits(asg_name=asg_name, config=config, min=0, max=0, desired=0) +def get_version(stack): + """ + Get the version of the stack if tagged. + + :param stack: stack object + :return: version or empty string + """ + tags = filter(lambda x: x.get("Key") == "Version", stack.get("Tags")) + return list(tags)[0].get("Value") if len(list(tags)) > 0 else "" + + +def colorize(stack_status, args): + """ + Color the output, COMPLETE = green, FAILED = red, IN_PROGRESS = yellow. + + :param status: stack status + :return: colorized status string + """ + if not args.color: + return stack_status + end = "0m" + status_to_color = {"COMPLETE": "0;32m", "FAILED": "0;31m", "IN_PROGRESS": "10;33m"} + for status in status_to_color: + if status in stack_status: + return "\033[%s%s\033[%s" % (status_to_color[status], stack_status, end) + + def list_stacks(args): config = cfnconfig.ParallelClusterConfig(args) cfn = boto3.client( @@ -380,9 +409,18 @@ def list_stacks(args): ) try: stacks = cfn.describe_stacks().get("Stacks") + result = [] for stack in stacks: if stack.get("ParentId") is None and stack.get("StackName").startswith("parallelcluster-"): - LOGGER.info("%s", stack.get("StackName")[len("parallelcluster-") :]) # noqa: E203 + pcluster_version = get_version(stack) + result.append( + [ + stack.get("StackName")[len("parallelcluster-") :], # noqa: E203 + colorize(stack.get("StackStatus"), args), + pcluster_version, + ] + ) + LOGGER.info(tabulate(result, tablefmt="plain")) except ClientError as e: LOGGER.critical(e.response.get("Error").get("Message")) sys.exit(1) diff --git a/docs/configuration.rst b/docs/configuration.rst index 1426680c8c..4b3a7127e3 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -100,6 +100,8 @@ template_url """""""""""" Defines the path to the CloudFormation template used to create the cluster. +Updates use the template the stack was created with. + Defaults to ``https://s3.amazonaws.com/-aws-parallelcluster/templates/aws-parallelcluster-.cfn.json``. :: From 7b9b5a4d82b8d11810df01f6c9e9b9aafc5c082d Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 30 Apr 2019 15:32:35 +0200 Subject: [PATCH 058/121] Add finalize recipe execution at the end of user data The finalize recipe starts the node daemons and sends the COMPUTE_READY event if the node is part of the ComputeFleet. This solves the issue of daemons being started before the end of chef recipes and post_install script. 
Signed-off-by: Francesco De Martino --- cloudformation/aws-parallelcluster.cfn.json | 29 ++++++++++++++------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 68eb976260..cf05dc35e5 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -2370,7 +2370,8 @@ "chefPrepEnv", "shellRunPreInstall", "chefConfig", - "shellRunPostInstall" + "shellRunPostInstall", + "chefFinalize" ] }, "deployConfigFiles": { @@ -2581,6 +2582,14 @@ "command": "/opt/parallelcluster/scripts/fetch_and_run -postinstall" } } + }, + "chefFinalize": { + "commands": { + "chef": { + "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster::finalize", + "cwd": "/etc/chef" + } + } } } }, @@ -3193,7 +3202,7 @@ "shellRunPreInstall", "chefConfig", "shellRunPostInstall", - "signalComputeReady" + "chefFinalize" ] }, "deployConfigFiles": { @@ -3379,10 +3388,11 @@ } } }, - "signalComputeReady": { + "chefFinalize": { "commands": { - "compute_ready": { - "command": "/opt/parallelcluster/scripts/compute_ready" + "chef": { + "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster::finalize", + "cwd": "/etc/chef" } } } @@ -3858,7 +3868,7 @@ "shellRunPreInstall", "chefConfig", "shellRunPostInstall", - "signalComputeReady" + "chefFinalize" ] }, "deployConfigFiles": { @@ -4044,10 +4054,11 @@ } } }, - "signalComputeReady": { + "chefFinalize": { "commands": { - "compute_ready": { - "command": "/opt/parallelcluster/scripts/compute_ready" + "chef": { + "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster::finalize", + "cwd": "/etc/chef" } } } From 5ada2aaafbbde0d04ac2a534dfb454aa8e22843e Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 1 May 2019 14:15:25 +0200 Subject: [PATCH 059/121] Mark compute instance as Unhealthy in ASG on cfn-init failures Before marking the instance as Unhealthy and have ASG replace it the /var/log directory is gzipped and saved to the shared directory /home/logs/compute/ in an archive named $instance_id.tar.gz. This solves the issue of compute nodes remaining up and running forever in case of cfn-init failure without the possibility of being recovered. 
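
For reference, a hypothetical boto3 equivalent of the `aws autoscaling set-instance-health` CLI call that the compute node UserData performs below; the region and instance id are placeholders:

```python
# Illustration only: mark an ASG instance unhealthy so the group replaces it.
import boto3

autoscaling = boto3.client("autoscaling", region_name="us-east-1")
autoscaling.set_instance_health(
    InstanceId="i-0123456789abcdef0",
    HealthStatus="Unhealthy",
    # False forces immediate replacement; the UserData script omits the flag
    # and therefore relies on the default, which respects the grace period.
    ShouldRespectGracePeriod=False,
)
```
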
Signed-off-by: Francesco De Martino --- cloudformation/aws-parallelcluster.cfn.json | 41 ++++++++++++++------- docs/iam.rst | 3 +- util/uploadTemplate.sh | 0 3 files changed, 30 insertions(+), 14 deletions(-) mode change 100644 => 100755 util/uploadTemplate.sh diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index cf05dc35e5..9dfefe6b59 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1695,7 +1695,8 @@ "autoscaling:TerminateInstanceInAutoScalingGroup", "autoscaling:SetDesiredCapacity", "autoscaling:UpdateAutoScalingGroup", - "autoscaling:DescribeTags" + "autoscaling:DescribeTags", + "autoscaling:SetInstanceHealth" ], "Effect": "Allow", "Resource": [ @@ -3013,15 +3014,22 @@ "#!/bin/bash -x\n\n", "function error_exit\n", "{\n", - " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", - { - "Ref": "AWS::StackName" - }, - " --resource=ComputeFleet --region=", + " region=", { "Ref": "AWS::Region" }, "\n", + " instance_id=$(curl --retry 3 --retry-delay 0 --silent --fail http://169.254.169.254/latest/meta-data/instance-id)\n", + " log_dir=/home/logs/compute\n", + " mkdir -p ${log_dir}\n", + " echo \"Reporting instance as unhealthy and dumping logs to ${log_dir}/${instance_id}.tar.gz\"\n", + " tar -czf ${log_dir}/${instance_id}.tar.gz /var/log\n", + " aws --region ${region} autoscaling set-instance-health --instance-id ${instance_id} --health-status Unhealthy\n", + " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", + { + "Ref": "AWS::StackName" + }, + " --resource=ComputeFleet --region=${region}\n", " exit 1\n", "}\n", "function vendor_cookbook\n", @@ -3176,7 +3184,7 @@ "Ref": "AWS::Region" }, " || error_exit 'Failed to run cfn-init. If --norollback was specified, check /var/log/cfn-init.log and /var/log/cloud-init-output.log.'\n", - "cfn-signal ${proxy_args} --exit-code=0 --reason=\"MasterServer setup complete\" --stack=", + "cfn-signal ${proxy_args} --exit-code=0 --reason=\"ComputeServer setup complete\" --stack=", { "Ref": "AWS::StackName" }, @@ -3677,15 +3685,22 @@ "#!/bin/bash -x\n\n", "function error_exit\n", "{\n", - " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", - { - "Ref": "AWS::StackName" - }, - " --resource=ComputeFleet --region=", + " region=", { "Ref": "AWS::Region" }, "\n", + " instance_id=$(curl --retry 3 --retry-delay 0 --silent --fail http://169.254.169.254/latest/meta-data/instance-id)\n", + " log_dir=/home/logs/compute\n", + " mkdir -p ${log_dir}\n", + " echo \"Reporting instance as unhealthy and dumping logs to ${log_dir}/${instance_id}.tar.gz\"\n", + " tar -czf ${log_dir}/${instance_id}.tar.gz /var/log\n", + " aws --region ${region} autoscaling set-instance-health --instance-id ${instance_id} --health-status Unhealthy\n", + " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", + { + "Ref": "AWS::StackName" + }, + " --resource=ComputeFleet --region=${region}\n", " exit 1\n", "}\n", "function vendor_cookbook\n", @@ -3840,7 +3855,7 @@ "Ref": "AWS::Region" }, " || error_exit 'Failed to run cfn-init. 
If --norollback was specified, check /var/log/cfn-init.log and /var/log/cloud-init-output.log.'\n",
-                                "cfn-signal ${proxy_args} --exit-code=0 --reason=\"MasterServer setup complete\" --stack=",
+                                "cfn-signal ${proxy_args} --exit-code=0 --reason=\"ComputeServer setup complete\" --stack=",
                                 {
                                     "Ref": "AWS::StackName"
                                 },
diff --git a/docs/iam.rst b/docs/iam.rst
index 40787131e3..276ee17d03 100644
--- a/docs/iam.rst
+++ b/docs/iam.rst
@@ -86,7 +86,8 @@ In case you are using SGE, Slurm or Torque as a scheduler:
                 "autoscaling:TerminateInstanceInAutoScalingGroup",
                 "autoscaling:SetDesiredCapacity",
                 "autoscaling:DescribeTags",
-                "autoScaling:UpdateAutoScalingGroup"
+                "autoScaling:UpdateAutoScalingGroup",
+                "autoscaling:SetInstanceHealth"
             ],
             "Sid": "Autoscaling",
             "Effect": "Allow"
diff --git a/util/uploadTemplate.sh b/util/uploadTemplate.sh
old mode 100644
new mode 100755

From de8d1d34c39e926bb7eef9ab6fc12648b66f8f5f Mon Sep 17 00:00:00 2001
From: Enrico Usai
Date: Fri, 3 May 2019 16:20:36 +0200
Subject: [PATCH 060/121] cli: speed up version evaluation in pcluster list
 command

Avoid creating the tag list multiple times: the iteration now stops as soon
as a match is found, and an empty string is returned as the default when no
Version tag is present.

Signed-off-by: Enrico Usai
---
 cli/pcluster/pcluster.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cli/pcluster/pcluster.py b/cli/pcluster/pcluster.py
index 5063677307..1c1615dc71 100644
--- a/cli/pcluster/pcluster.py
+++ b/cli/pcluster/pcluster.py
@@ -379,8 +379,7 @@ def get_version(stack):
     :param stack: stack object
     :return: version or empty string
     """
-    tags = filter(lambda x: x.get("Key") == "Version", stack.get("Tags"))
-    return list(tags)[0].get("Value") if len(list(tags)) > 0 else ""
+    return next((tag.get("Value") for tag in stack.get("Tags") if tag.get("Key") == "Version"), "")

From a06279309c381ee80c380c7446dda9658621f197 Mon Sep 17 00:00:00 2001
From: Francesco De Martino
Date: Thu, 2 May 2019 13:16:59 +0200
Subject: [PATCH 061/121] integ tests: test compute replacement on cfn-init
 failures

This tests the changes implemented in
https://github.com/aws/aws-parallelcluster/pull/1030

Signed-off-by: Francesco De Martino
---
 .../remote_command_executor.py | 7 +-
 tests/integration-tests/tests/cfn-init/__init__.py | 11 +++
 .../tests/cfn-init/test_cfn_init.py | 74 +++++++++++++++++++
 .../pcluster.config.ini | 20 +++++
 .../post_install.sh | 13 ++++
 5 files changed, 123 insertions(+), 2 deletions(-)
 create mode 100644 tests/integration-tests/tests/cfn-init/__init__.py
 create mode 100644 tests/integration-tests/tests/cfn-init/test_cfn_init.py
 create mode 100644 tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/pcluster.config.ini
 create mode 100755 tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/post_install.sh

diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py
index 8e14fb3aa7..46d68ef778 100644
--- a/tests/integration-tests/remote_command_executor.py
+++ b/tests/integration-tests/remote_command_executor.py
@@ -50,7 +50,9 @@ def __del__(self):
             # Catch all exceptions if we fail to close the clients
             logging.warning("Exception raised when closing remote ssh client: {0}".format(e))

-    def run_remote_command(self, command, log_error=True, additional_files=None, raise_on_error=True, login_shell=True):
+    def run_remote_command(
+        self, command, log_error=True, additional_files=None, raise_on_error=True, login_shell=True, hide=False
+    ):
""" Execute remote command on the cluster master node. @@ -59,6 +61,7 @@ def run_remote_command(self, command, log_error=True, additional_files=None, rai :param additional_files: additional files to copy before executing script. :param raise_on_error: if True raises a RemoteCommandExecutionError on failures :param login_shell: if True prepends /bin/bash --login -c to the given command + :param hide: do not print command output to the local stdout :return: result of the execution. """ if isinstance(command, list): @@ -68,7 +71,7 @@ def run_remote_command(self, command, log_error=True, additional_files=None, rai if login_shell: command = "/bin/bash --login -c {0}".format(shlex.quote(command)) - result = self.__connection.run(command, warn=True, pty=True, hide=False) + result = self.__connection.run(command, warn=True, pty=True, hide=hide) result.stdout = "\n".join(result.stdout.splitlines()) result.stderr = "\n".join(result.stderr.splitlines()) if result.failed and raise_on_error: diff --git a/tests/integration-tests/tests/cfn-init/__init__.py b/tests/integration-tests/tests/cfn-init/__init__.py new file mode 100644 index 0000000000..2251b11f46 --- /dev/null +++ b/tests/integration-tests/tests/cfn-init/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init.py b/tests/integration-tests/tests/cfn-init/test_cfn_init.py new file mode 100644 index 0000000000..0bf0387db4 --- /dev/null +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init.py @@ -0,0 +1,74 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import boto3 +import pytest +from retrying import retry + +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor +from tests.common.schedulers_common import SlurmCommands +from time_utils import minutes, seconds + + +@pytest.mark.regions(["eu-central-1"]) +@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.schedulers(["slurm"]) +@pytest.mark.usefixtures("os", "instance", "scheduler") +def test_replace_compute_on_failure(region, pcluster_config_reader, clusters_factory, s3_bucket_factory, test_datadir): + """ + Test that compute nodes get replaced on userdata failures and logs get saved in shared directory. + + The failure is caused by a post_install script that exits with errors on compute nodes. 
+ """ + bucket_name = s3_bucket_factory() + bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) + bucket.upload_file(str(test_datadir / "post_install.sh"), "post_install.sh") + cluster_config = pcluster_config_reader(bucket_name=bucket_name) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + # submit a job to spin up a compute node that will fail due to post_install script + sge_commands = SlurmCommands(remote_command_executor) + sge_commands.submit_command("sleep 1") + instance_id = _wait_compute_log(remote_command_executor) + + # extract logs and check one of them + remote_command_executor.run_remote_command( + "tar -xf /home/logs/compute/{0}.tar.gz --directory /tmp".format(instance_id) + ) + remote_command_executor.run_remote_command("test -f /tmp/var/log/nodewatcher") + messages_log = remote_command_executor.run_remote_command("cat /tmp/var/log/messages", hide=True).stdout + assert_that(messages_log).contains( + "Reporting instance as unhealthy and dumping logs to /home/logs/compute/{0}.tar.gz".format(instance_id) + ) + + # check that instance got already replaced or is marked as Unhealthy + response = boto3.client("autoscaling", region_name=region).describe_auto_scaling_instances( + InstanceIds=[instance_id] + ) + assert_that( + not response["AutoScalingInstances"] or response["AutoScalingInstances"][0]["HealthStatus"] == "UNHEALTHY" + ).is_true() + + +@retry( + retry_on_exception=lambda exception: isinstance(exception, RemoteCommandExecutionError), + wait_fixed=seconds(30), + stop_max_delay=minutes(10), +) +def _wait_compute_log(remote_command_executor): + remote_command_executor.run_remote_command("test -d /home/logs/compute", log_error=False) + # return instance-id + return remote_command_executor.run_remote_command( + "find /home/logs/compute/ -type f -printf '%f\n' -quit | head -1 | cut -d. -f1", log_error=False + ).stdout diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/pcluster.config.ini b/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/pcluster.config.ini new file mode 100644 index 0000000000..27ca7e1b1e --- /dev/null +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/pcluster.config.ini @@ -0,0 +1,20 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +initial_queue_size = 0 +s3_read_resource = arn:aws:s3:::{{ bucket_name }}/* +post_install = s3://{{ bucket_name }}/post_install.sh + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/post_install.sh b/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/post_install.sh new file mode 100755 index 0000000000..7fe90d7d00 --- /dev/null +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/post_install.sh @@ -0,0 +1,13 @@ +#!/bin/bash +. 
"/etc/parallelcluster/cfnconfig" + +case "${cfn_node_type}" in + MasterServer) + exit 0 + ;; + ComputeFleet) + exit 1 + ;; + *) + ;; +esac From 9cf1e6783109fd0c2d01c647ef41b24367471876 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Thu, 2 May 2019 17:48:19 +0200 Subject: [PATCH 062/121] integ tests: test compute replacement when not attached to scheduler Signed-off-by: Francesco De Martino --- .../tests/cfn-init/test_cfn_init.py | 36 ++++++------------ .../tests/common/compute_logs_common.py | 27 +++++++++++++ .../tests/common/scaling_common.py | 15 +++++++- .../tests/common/schedulers_common.py | 27 ++++++++++++- tests/integration-tests/tests/test_scaling.py | 38 ++++++++++++++++++- .../pcluster.config.ini | 19 ++++++++++ .../slurm_kill_scheduler_job.sh | 13 +++++++ 7 files changed, 146 insertions(+), 29 deletions(-) create mode 100644 tests/integration-tests/tests/common/compute_logs_common.py create mode 100644 tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/pcluster.config.ini create mode 100755 tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init.py b/tests/integration-tests/tests/cfn-init/test_cfn_init.py index 0bf0387db4..946ab4eb8e 100644 --- a/tests/integration-tests/tests/cfn-init/test_cfn_init.py +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init.py @@ -12,12 +12,12 @@ import boto3 import pytest -from retrying import retry from assertpy import assert_that -from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor +from remote_command_executor import RemoteCommandExecutor +from tests.common.compute_logs_common import wait_compute_log +from tests.common.scaling_common import assert_instance_replaced_or_terminating from tests.common.schedulers_common import SlurmCommands -from time_utils import minutes, seconds @pytest.mark.regions(["eu-central-1"]) @@ -40,9 +40,16 @@ def test_replace_compute_on_failure(region, pcluster_config_reader, clusters_fac # submit a job to spin up a compute node that will fail due to post_install script sge_commands = SlurmCommands(remote_command_executor) sge_commands.submit_command("sleep 1") - instance_id = _wait_compute_log(remote_command_executor) + instance_id = wait_compute_log(remote_command_executor) # extract logs and check one of them + _assert_compute_logs(remote_command_executor, instance_id) + + # check that instance got already replaced or is marked as Unhealthy + assert_instance_replaced_or_terminating(instance_id, region) + + +def _assert_compute_logs(remote_command_executor, instance_id): remote_command_executor.run_remote_command( "tar -xf /home/logs/compute/{0}.tar.gz --directory /tmp".format(instance_id) ) @@ -51,24 +58,3 @@ def test_replace_compute_on_failure(region, pcluster_config_reader, clusters_fac assert_that(messages_log).contains( "Reporting instance as unhealthy and dumping logs to /home/logs/compute/{0}.tar.gz".format(instance_id) ) - - # check that instance got already replaced or is marked as Unhealthy - response = boto3.client("autoscaling", region_name=region).describe_auto_scaling_instances( - InstanceIds=[instance_id] - ) - assert_that( - not response["AutoScalingInstances"] or response["AutoScalingInstances"][0]["HealthStatus"] == "UNHEALTHY" - ).is_true() - - -@retry( - retry_on_exception=lambda exception: isinstance(exception, RemoteCommandExecutionError), - wait_fixed=seconds(30), - stop_max_delay=minutes(10), 
-) -def _wait_compute_log(remote_command_executor): - remote_command_executor.run_remote_command("test -d /home/logs/compute", log_error=False) - # return instance-id - return remote_command_executor.run_remote_command( - "find /home/logs/compute/ -type f -printf '%f\n' -quit | head -1 | cut -d. -f1", log_error=False - ).stdout diff --git a/tests/integration-tests/tests/common/compute_logs_common.py b/tests/integration-tests/tests/common/compute_logs_common.py new file mode 100644 index 0000000000..a9b9de0c06 --- /dev/null +++ b/tests/integration-tests/tests/common/compute_logs_common.py @@ -0,0 +1,27 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +from retrying import retry + +from remote_command_executor import RemoteCommandExecutionError +from time_utils import minutes, seconds + + +@retry( + retry_on_exception=lambda exception: isinstance(exception, RemoteCommandExecutionError), + wait_fixed=seconds(30), + stop_max_delay=minutes(10), +) +def wait_compute_log(remote_command_executor): + remote_command_executor.run_remote_command("test -d /home/logs/compute", log_error=False) + # return instance-id + return remote_command_executor.run_remote_command( + "find /home/logs/compute/ -type f -printf '%f\n' -quit | head -1 | cut -d. -f1", log_error=False + ).stdout diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py index 4edc0e8c76..ecbe0b9eec 100644 --- a/tests/integration-tests/tests/common/scaling_common.py +++ b/tests/integration-tests/tests/common/scaling_common.py @@ -14,6 +14,7 @@ import boto3 from retrying import RetryError, retry +from assertpy import assert_that from time_utils import seconds @@ -43,7 +44,7 @@ def get_compute_nodes_allocation(scheduler_commands, region, stack_name, max_mon ) def _watch_compute_nodes_allocation(): compute_nodes = scheduler_commands.compute_nodes_count() - asg_capacity = _get_desired_asg_capacity(region, stack_name) + asg_capacity = get_desired_asg_capacity(region, stack_name) timestamp = time.time() # add values only if there is a transition. 
@@ -113,7 +114,7 @@ def _get_asg(region, stack_name): return response["AutoScalingGroups"][0] -def _get_desired_asg_capacity(region, stack_name): +def get_desired_asg_capacity(region, stack_name): """Retrieve the desired capacity of the autoscaling group for a specific cluster.""" return _get_asg(region, stack_name)["DesiredCapacity"] @@ -121,3 +122,13 @@ def _get_desired_asg_capacity(region, stack_name): def get_max_asg_capacity(region, stack_name): """Retrieve the max capacity of the autoscaling group for a specific cluster.""" return _get_asg(region, stack_name)["MaxSize"] + + +def assert_instance_replaced_or_terminating(instance_id, region): + """Assert that a given instance got replaced or is marked as Unhealthy.""" + response = boto3.client("autoscaling", region_name=region).describe_auto_scaling_instances( + InstanceIds=[instance_id] + ) + assert_that( + not response["AutoScalingInstances"] or response["AutoScalingInstances"][0]["LifecycleState"] == "Terminating" + ).is_true() diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index 8b5dabb97f..b58a77387c 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -8,7 +8,7 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. - +import os import re from abc import ABCMeta, abstractmethod @@ -65,6 +65,16 @@ def submit_command(self, command, nodes=1): """ pass + @abstractmethod + def submit_script(self, script, nodes=1): + """ + Submit a job to the scheduler by using a script file. + + :param script: script to submit. + :return: result from remote command execution. 
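+
+        Example (illustrative): scheduler_commands.submit_script("/shared/job.sh", nodes=2)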
+ """ + pass + @abstractmethod def assert_job_succeeded(self, job_id, children_number=0): """ @@ -108,6 +118,9 @@ def assert_job_submitted(self, awsbsub_output): # noqa: D102 def submit_command(self, command, nodes=1): # noqa: D102 return self._remote_command_executor.run_remote_command('echo "{0}" | awsbsub -n {1}'.format(command, nodes)) + def submit_script(self, script, nodes=1): # noqa: D102 + raise NotImplementedError + def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 __tracebackhide__ = True status = self.get_job_exit_status(job_id) @@ -145,6 +158,9 @@ def submit_command(self, command, nodes=1): # noqa: D102 # TODO add support for multiple nodes return self._remote_command_executor.run_remote_command("echo '{0}' | qsub".format(command)) + def submit_script(self, script, nodes=1): # noqa: D102 + raise NotImplementedError + def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 __tracebackhide__ = True status = self.get_job_exit_status(job_id) @@ -182,6 +198,12 @@ def assert_job_submitted(self, sbatch_output): # noqa: D102 def submit_command(self, command, nodes=1): # noqa: D102 return self._remote_command_executor.run_remote_command("sbatch -N {0} --wrap='{1}'".format(nodes, command)) + def submit_script(self, script, nodes=1): # noqa: D102 + script_name = os.path.basename(script) + return self._remote_command_executor.run_remote_command( + "sbatch -N {0} {1}".format(nodes, script_name), additional_files=[script] + ) + def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)) return "JobState=COMPLETED" in result.stdout @@ -210,6 +232,9 @@ def assert_job_submitted(self, qsub_output): # noqa: D102 def submit_command(self, command): # noqa: D102 raise NotImplementedError + def submit_script(self, script, nodes=1): # noqa: D102 + raise NotImplementedError + def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 raise NotImplementedError diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index 24e4d2d218..02df37a49d 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -15,7 +15,12 @@ from assertpy import assert_that from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor -from tests.common.scaling_common import get_compute_nodes_allocation +from tests.common.compute_logs_common import wait_compute_log +from tests.common.scaling_common import ( + assert_instance_replaced_or_terminating, + get_compute_nodes_allocation, + get_desired_asg_capacity, +) from tests.common.schedulers_common import get_scheduler_commands from time_utils import minutes @@ -56,6 +61,37 @@ def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clu ) +@pytest.mark.regions(["sa-east-1"]) +@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.schedulers(["slurm"]) +@pytest.mark.usefixtures("region", "os", "instance") +@pytest.mark.nodewatcher +def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir): + cluster_config = pcluster_config_reader() + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + + # submit a job that kills the slurm daemon so that the node enters a failing state + 
scheduler_commands.submit_script(str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler))) + instance_id = wait_compute_log(remote_command_executor) + + _assert_compute_logs(remote_command_executor, instance_id) + assert_instance_replaced_or_terminating(instance_id, region) + # verify that desired capacity is still 1 + assert_that(get_desired_asg_capacity(region, cluster.cfn_name)).is_equal_to(1) + + +def _assert_compute_logs(remote_command_executor, instance_id): + remote_command_executor.run_remote_command( + "tar -xf /home/logs/compute/{0}.tar.gz --directory /tmp".format(instance_id) + ) + remote_command_executor.run_remote_command("test -f /tmp/var/log/nodewatcher") + messages_log = remote_command_executor.run_remote_command("cat /tmp/var/log/nodewatcher", hide=True).stdout + assert_that(messages_log).contains("Node is marked as down by scheduler or not attached correctly. Terminating...") + assert_that(messages_log).contains("Dumping logs to /home/logs/compute/{0}.tar.gz".format(instance_id)) + + def _assert_scaling_works( asg_capacity_time_series, compute_nodes_time_series, expected_asg_capacity, expected_compute_nodes ): diff --git a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/pcluster.config.ini b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/pcluster.config.ini new file mode 100644 index 0000000000..f30fa38db5 --- /dev/null +++ b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/pcluster.config.ini @@ -0,0 +1,19 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +initial_queue_size = 1 +maintain_initial_size = true + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} diff --git a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh new file mode 100755 index 0000000000..b0676573e4 --- /dev/null +++ b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
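+
+# Kill every slurm daemon process so that the node is no longer attached to the
+# scheduler; nodewatcher should then mark it as down and terminate it.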
+sudo kill $(ps aux | grep '[s]lurm' | awk '{print $2}') From e29bf444a8d829efa90158340dcdca0094ddcc9e Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 7 May 2019 11:21:48 +0200 Subject: [PATCH 063/121] integ tests - cfn_init: fix log file assertion nodewatcher log is not present if failures happen during userdata Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/cfn-init/test_cfn_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init.py b/tests/integration-tests/tests/cfn-init/test_cfn_init.py index 946ab4eb8e..b390af5e66 100644 --- a/tests/integration-tests/tests/cfn-init/test_cfn_init.py +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init.py @@ -53,7 +53,7 @@ def _assert_compute_logs(remote_command_executor, instance_id): remote_command_executor.run_remote_command( "tar -xf /home/logs/compute/{0}.tar.gz --directory /tmp".format(instance_id) ) - remote_command_executor.run_remote_command("test -f /tmp/var/log/nodewatcher") + remote_command_executor.run_remote_command("test -f /tmp/var/log/cfn-init.log") messages_log = remote_command_executor.run_remote_command("cat /tmp/var/log/messages", hide=True).stdout assert_that(messages_log).contains( "Reporting instance as unhealthy and dumping logs to /home/logs/compute/{0}.tar.gz".format(instance_id) From 1c64300f2132f61b2a0200dee612f6daac530765 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Tue, 7 May 2019 16:59:08 -0700 Subject: [PATCH 064/121] Fix awsbatch stop and start Signed-off-by: Sean Smith --- cli/pcluster/pcluster.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/cli/pcluster/pcluster.py b/cli/pcluster/pcluster.py index 1c1615dc71..1b12c70927 100644 --- a/cli/pcluster/pcluster.py +++ b/cli/pcluster/pcluster.py @@ -328,8 +328,9 @@ def start(args): if config.parameters.get("MinSize") and int(config.parameters.get("MinSize")) > 0 else 0 ) + ce_name = get_batch_ce(stack_name, config) start_batch_ce( - ce_name=stack_name, config=config, min_vcpus=min_vcpus, desired_vcpus=desired_vcpus, max_vcpus=max_vcpus + ce_name=ce_name, config=config, min_vcpus=min_vcpus, desired_vcpus=desired_vcpus, max_vcpus=max_vcpus ) else: LOGGER.info("Starting compute fleet : %s", args.cluster_name) @@ -364,7 +365,8 @@ def stop(args): if config.parameters.get("Scheduler") == "awsbatch": LOGGER.info("Disabling AWS Batch compute environment : %s", args.cluster_name) - stop_batch_ce(ce_name=stack_name, config=config) + ce_name = get_batch_ce(stack_name, config) + stop_batch_ce(ce_name=ce_name, config=config) else: LOGGER.info("Stopping compute fleet : %s", args.cluster_name) # Set Resource limits @@ -372,6 +374,29 @@ def stop(args): set_asg_limits(asg_name=asg_name, config=config, min=0, max=0, desired=0) +def get_batch_ce(stack_name, config): + """ + Get name of the AWS Batch Compute Environment. 
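+    (In practice this resolves the BatchComputeEnvironmentArn output of the master stack.)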
+ + :param stack_name: name of the master stack + :param config: config + :return: ce_name or exit if not found + """ + cfn = boto3.client( + "cloudformation", + region_name=config.region, + aws_access_key_id=config.aws_access_key_id, + aws_secret_access_key=config.aws_secret_access_key, + ) + + try: + outputs = cfn.describe_stacks(StackName=stack_name).get("Stacks")[0].get("Outputs") + return _get_output_value(outputs, "BatchComputeEnvironmentArn") + except ClientError as e: + LOGGER.critical(e.response.get("Error").get("Message")) + sys.exit(1) + + def get_version(stack): """ Get the version of the stack if tagged. From 5b3783268069b38f9d8bd896f3a56ccdeadff5e7 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 8 May 2019 18:05:37 +0200 Subject: [PATCH 065/121] integ tests - cfn_init: make log assertion work cross OSs Signed-off-by: Francesco De Martino --- .../integration-tests/tests/cfn-init/test_cfn_init.py | 10 ++++++---- .../tests/common/compute_logs_common.py | 2 +- tests/integration-tests/tests/common/scaling_common.py | 4 +++- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init.py b/tests/integration-tests/tests/cfn-init/test_cfn_init.py index b390af5e66..da5c587951 100644 --- a/tests/integration-tests/tests/cfn-init/test_cfn_init.py +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init.py @@ -54,7 +54,9 @@ def _assert_compute_logs(remote_command_executor, instance_id): "tar -xf /home/logs/compute/{0}.tar.gz --directory /tmp".format(instance_id) ) remote_command_executor.run_remote_command("test -f /tmp/var/log/cfn-init.log") - messages_log = remote_command_executor.run_remote_command("cat /tmp/var/log/messages", hide=True).stdout - assert_that(messages_log).contains( - "Reporting instance as unhealthy and dumping logs to /home/logs/compute/{0}.tar.gz".format(instance_id) - ) + output = remote_command_executor.run_remote_command( + 'find /tmp/var/log -type f | xargs grep "Reporting instance as unhealthy and dumping logs to"', + hide=True, + login_shell=False, + ).stdout + assert_that(output).is_not_empty() diff --git a/tests/integration-tests/tests/common/compute_logs_common.py b/tests/integration-tests/tests/common/compute_logs_common.py index a9b9de0c06..df6b43abf2 100644 --- a/tests/integration-tests/tests/common/compute_logs_common.py +++ b/tests/integration-tests/tests/common/compute_logs_common.py @@ -23,5 +23,5 @@ def wait_compute_log(remote_command_executor): remote_command_executor.run_remote_command("test -d /home/logs/compute", log_error=False) # return instance-id return remote_command_executor.run_remote_command( - "find /home/logs/compute/ -type f -printf '%f\n' -quit | head -1 | cut -d. -f1", log_error=False + "find /home/logs/compute/ -type f -printf '%f\\n' -quit | head -1 | cut -d. 
-f1", log_error=False ).stdout diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py index ecbe0b9eec..ce80cc3616 100644 --- a/tests/integration-tests/tests/common/scaling_common.py +++ b/tests/integration-tests/tests/common/scaling_common.py @@ -130,5 +130,7 @@ def assert_instance_replaced_or_terminating(instance_id, region): InstanceIds=[instance_id] ) assert_that( - not response["AutoScalingInstances"] or response["AutoScalingInstances"][0]["LifecycleState"] == "Terminating" + not response["AutoScalingInstances"] + or response["AutoScalingInstances"][0]["LifecycleState"] == "Terminating" + or response["AutoScalingInstances"][0]["HealthStatus"] == "UNHEALTHY" ).is_true() From 731b8b6aac642876af738c1213571b2fd7930270 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 9 May 2019 11:32:02 -0700 Subject: [PATCH 066/121] Typo in Upload Template Script Signed-off-by: Sean Smith --- util/uploadTemplate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/uploadTemplate.sh b/util/uploadTemplate.sh index 234a204dbc..d5a6279c38 100755 --- a/util/uploadTemplate.sh +++ b/util/uploadTemplate.sh @@ -95,7 +95,7 @@ main() { echo "" echo "Done. Add the following variables to the pcluster config file, under the [cluster ...] section" echo "template_url = https://s3${_bucket_region}.amazonaws.com/${_bucket}/template/aws-parallelcluster.cfn.${_version}.json" - echo "custom_awsbatch_template_url = https://s3${_bucket_region}.amazonaws.com/${_bucket}/template/batch.cfn.json" + echo "custom_awsbatch_template_url = https://s3${_bucket_region}.amazonaws.com/${_bucket}/template/batch-substack.cfn.json" } main "$@" From efe837353cb290f39ad6ddcb2f9720b7abe8f23a Mon Sep 17 00:00:00 2001 From: Matteo Fiordarancio Date: Fri, 10 May 2019 11:03:01 +0200 Subject: [PATCH 067/121] doc: fixed a visual glitch of the getting started guide --- docs/getting_started.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index b351bbb76c..e0c97d09cf 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -65,10 +65,11 @@ was originally installed: Configuring AWS ParallelCluster =============================== -First you'll need to setup your IAM credentials, see `AWS CLI `_. +First you'll need to setup your IAM credentials, see `AWS CLI `_ for more information. :: + $ aws configure AWS Access Key ID [None]: AKIAIOSFODNN7EXAMPLE AWS Secret Access Key [None]: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY From 6b3ba1ce5d023865cf2dcc28d648cacf692ccbd7 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 9 May 2019 11:47:45 -0700 Subject: [PATCH 068/121] Update outdated changes section --- README.rst | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/README.rst b/README.rst index 583643ad4d..c9df98e8ae 100644 --- a/README.rst +++ b/README.rst @@ -103,23 +103,7 @@ HPC forum which may be helpful:https://forums.aws.amazon.com/forum.jspa?forumID= Changes ------- -CfnCluster 1.6 IAM Change -========================= -Between CfnCluster 1.5.4 and 1.6.0 we made a change to the CfnClusterInstancePolicy that adds “s3:GetObject” permissions -on objects in -cfncluster bucket, "autoscaling:SetDesiredCapacity", "autoscaling:DescribeTags" permissions and -"cloudformation:DescribeStacks" permissions on ::stack/cfncluster-*. - -If you’re using a custom policy (e.g. 
you specify "ec2_iam_role" in your config) be sure it includes this new permission. See https://aws-parallelcluster.readthedocs.io/en/latest/iam.html - -CfnCluster 1.5 IAM Change -========================= -Between CfnCluster 1.4.2 and 1.5.0 we made a change to the CfnClusterInstancePolicy that adds “ec2:DescribeVolumes” permissions. If you’re using a custom policy (e.g. you specify "ec2_iam_role" in your config) be sure it includes this new permission. See https://aws-parallelcluster.readthedocs.io/en/latest/iam.html - -CfnCluster 1.2 and Earlier -========================== - -For various security (on our side) and maintenance reasons, CfnCluster -1.2 and earlier have been deprecated. AWS-side resources necessary to -create a cluster with CfnCluster 1.2 or earlier are no longer -available. Existing clusters will continue to operate, but new -clusters can not be created. +CfnCluster to AWS ParallelCluster +================================= +In Version `2.0.0`, we changed the name of CfnCluster to AWS ParallelCluster. With that name change we released several new features, which you can read about here: https://aws.amazon.com/blogs/opensource/aws-parallelcluster/ + From 71eac9204deb1323e9d57c55b6ccbc38574ada79 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Mon, 13 May 2019 09:05:34 -0700 Subject: [PATCH 069/121] Update Changes Section Updated link to point to CHANGELOG.rst. --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index c9df98e8ae..0020474fb1 100644 --- a/README.rst +++ b/README.rst @@ -105,5 +105,6 @@ Changes CfnCluster to AWS ParallelCluster ================================= -In Version `2.0.0`, we changed the name of CfnCluster to AWS ParallelCluster. With that name change we released several new features, which you can read about here: https://aws.amazon.com/blogs/opensource/aws-parallelcluster/ +In Version `2.0.0`, we changed the name of CfnCluster to AWS ParallelCluster. With that name change we released several new features, which you can read about in the `Change Log`_. +.. _`Change Log`: https://github.com/aws/aws-parallelcluster/blob/develop/CHANGELOG.rst#200 From c63db6d37714dbac24eb2e9f84530dc9338d4ea2 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Mon, 13 May 2019 11:38:43 -0700 Subject: [PATCH 070/121] Keep DynamoDB from updating DynamoDB (in provisioned mode) is limited to (1) update each day, preventing a `pcluster update`. To fix this, we're adding `"UpdateReplacePolicy" : "Retain",`, which disables updates to this resource. 
See https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/protect-stack-resources.html Signed-off-by: Sean Smith --- cloudformation/aws-parallelcluster.cfn.json | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 9dfefe6b59..b037c167ab 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1535,6 +1535,7 @@ }, "DynamoDBTable": { "Type": "AWS::DynamoDB::Table", + "UpdateReplacePolicy": "Retain", "Properties": { "AttributeDefinitions": [ { From 93ed3a46588ae3ff41f44276b5efb00199eeefe0 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Wed, 8 May 2019 13:59:05 -0700 Subject: [PATCH 071/121] Override numNodes in Job Definition Signed-off-by: Sean Smith --- cli/awsbatch/awsbsub.py | 96 +-------------- .../deregister_batch_mnp_job_definitions.py | 111 ----------------- cloudformation/batch-substack.cfn.json | 115 +----------------- 3 files changed, 6 insertions(+), 316 deletions(-) delete mode 100644 cli/pcluster/resources/batch/custom_resources_code/deregister_batch_mnp_job_definitions.py diff --git a/cli/awsbatch/awsbsub.py b/cli/awsbatch/awsbsub.py index 57791153cb..1a13c81d91 100644 --- a/cli/awsbatch/awsbsub.py +++ b/cli/awsbatch/awsbsub.py @@ -24,7 +24,7 @@ import argparse from awsbatch.common import AWSBatchCliConfig, Boto3ClientFactory, config_logger -from awsbatch.utils import S3Uploader, fail, get_job_definition_name_by_arn, shell_join +from awsbatch.utils import S3Uploader, fail, shell_join def _get_parser(): @@ -480,16 +480,13 @@ def run( # noqa: C901 FIXME } if nodes: - # Multi Node parallel submission - job_definition_version = self.__get_mnp_job_definition_version( - base_job_definition_arn=job_definition, nodes=nodes - ) - submission_args.update({"jobDefinition": job_definition_version}) + submission_args.update({"jobDefinition": job_definition}) - target_nodes = "0:%d" % (nodes - 1) + target_nodes = "0:" # populate node overrides node_overrides = { - "nodePropertyOverrides": [{"targetNodes": target_nodes, "containerOverrides": container_overrides}] + "numNodes": nodes, + "nodePropertyOverrides": [{"targetNodes": target_nodes, "containerOverrides": container_overrides}], } submission_args.update({"nodeOverrides": node_overrides}) if timeout: @@ -508,89 +505,6 @@ def run( # noqa: C901 FIXME except Exception as e: fail("Error submitting job to AWS Batch. Failed with exception: %s" % e) - def __get_mnp_job_definition_version(self, base_job_definition_arn, nodes): - """ - Get (and create if required) job definition version to use for the submission. - - :return: job definition arn - """ - # Check if there is already a job definition for the given number of nodes - job_definition_found = self.__search_for_job_definition(base_job_definition_arn, nodes) - if job_definition_found: - job_definition_arn = job_definition_found["jobDefinitionArn"] - self.log.info("Found existing Job definition (%s) with (%i) nodes" % (job_definition_arn, nodes)) - else: - self.log.info("Creating new Job definition with (%i) nodes" % nodes) - # create a new job definition revision - job_definition_arn = self.__register_new_job_definition(base_job_definition_arn, nodes) - - self.log.info("Job definition to use is (%s)" % job_definition_arn) - return job_definition_arn - - def __search_for_job_definition(self, base_job_definition, nodes): - """ - Search for existing job definition with the same name of the base_job_definition and the same number of nodes. 
- - :param base_job_definition: job definition arn - :param nodes: number of nodes - :return: the found jobDefinition object or None - """ - job_definition_found = None - base_job_definition_name = get_job_definition_name_by_arn(base_job_definition) - try: - next_token = "" - while next_token is not None: - response = self.batch_client.describe_job_definitions( - jobDefinitionName=base_job_definition_name, status="ACTIVE", nextToken=next_token - ) - for job_definition in response["jobDefinitions"]: - if job_definition["nodeProperties"]["numNodes"] == nodes: - job_definition_found = job_definition - break - next_token = response.get("nextToken") - except Exception as e: - fail("Error listing job definition. Failed with exception: %s" % e) - - return job_definition_found - - def __register_new_job_definition(self, base_job_definition_arn, nodes): - """ - Register a new job definition. - - It uses the base_job_definition_arn as starting point for the nodeRangeProperties. - - :param base_job_definition_arn: job definition arn to use as starting point - :param nodes: nuber of nodes to set in the job definition - :return: the ARN of the created job definition - """ - try: - # get base job definition and reuse its nodeRangeProperties - response = self.batch_client.describe_job_definitions( - jobDefinitions=[base_job_definition_arn], status="ACTIVE" - ) - job_definition = response["jobDefinitions"][0] - - # create new job definition - response = self.batch_client.register_job_definition( - jobDefinitionName=job_definition["jobDefinitionName"], - type="multinode", - nodeProperties={ - "numNodes": nodes, - "mainNode": 0, - "nodeRangeProperties": [ - { - "targetNodes": "0:%d" % (nodes - 1), - "container": job_definition["nodeProperties"]["nodeRangeProperties"][0]["container"], - } - ], - }, - ) - job_definition_arn = response["jobDefinitionArn"] - except Exception as e: - fail("Error listing job definition. Failed with exception: %s" % e) - - return job_definition_arn - def main(): """Command entrypoint.""" diff --git a/cli/pcluster/resources/batch/custom_resources_code/deregister_batch_mnp_job_definitions.py b/cli/pcluster/resources/batch/custom_resources_code/deregister_batch_mnp_job_definitions.py deleted file mode 100644 index 8de1e82c38..0000000000 --- a/cli/pcluster/resources/batch/custom_resources_code/deregister_batch_mnp_job_definitions.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with -# the License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. 
-import re -import time - -import boto3 -from botocore.exceptions import ClientError - -import crhelper - -# initialise logger -logger = crhelper.log_config({"RequestId": "CONTAINER_INIT"}, loglevel="info") -logger.info("Logging configured") -# set global to track init failures -init_failed = False - -try: - # Place initialization code here - logger.info("Container initialization completed") - batch_client = boto3.client("batch") -except Exception as e: - logger.error(e, exc_info=True) - init_failed = e - - -def get_job_definition_name_by_arn(job_definition_arn): - """ - Parse Job Definition arn and get name. - - Args: - job_definition_arn: something like arn:aws:batch:::job-definition/: - - Returns: the job definition name - """ - pattern = r".*/(.*):(.*)" - return re.search(pattern, job_definition_arn).group(1) - - -def retrieve_job_definition_revisions(name): - """ - Retrieve all revisions for a given job definition. - - Args: - name: name of the job definition - - Returns: an array containing all job definition revisions ARNs - """ - next_token = "" - job_definitions = [] - while next_token is not None: - response = batch_client.describe_job_definitions(jobDefinitionName=name, nextToken=next_token, status="ACTIVE") - if "jobDefinitions" in response: - for job_definition in response["jobDefinitions"]: - job_definitions.append(job_definition["jobDefinitionArn"]) - next_token = response.get("nextToken") - # Since it's not a time critical operation, sleeping to avoid hitting API's TPS limit. - time.sleep(0.5) - - return job_definitions - - -def deregister_job_definition_revisions(name): - """ - De-register all revisions belonging to a given job definition. - - Args: - name: name of the job definition - """ - job_definitions = retrieve_job_definition_revisions(name) - for job_definition in job_definitions: - try: - logger.info("De-registering job definition: %s" % job_definition) - batch_client.deregister_job_definition(jobDefinition=job_definition) - except ClientError: - logger.warning("job definition not found: %s. It was probably manually de-registered." % job_definition) - # Since it's not a time critical operation, sleeping to avoid hitting API's TPS limit. 
- time.sleep(0.5) - - -def create(event, context): - """Noop.""" - return "MNPJobDefinitionCleanupHandler", {} - - -def update(event, context): - """Noop.""" - return event["MNPJobDefinitionCleanupHandler"], {} - - -def delete(event, context): - """Deregister all mnp job definitions.""" - job_definition = get_job_definition_name_by_arn(event["ResourceProperties"]["JobDefinitionMNPArn"]) - logger.info("Job definition %s deletion: STARTED" % job_definition) - deregister_job_definition_revisions(job_definition) - logger.info("Job definition %s deletion: COMPLETED" % job_definition) - - -def handler(event, context): - """Main handler function, passes off it's work to crhelper's cfn_handler.""" # noqa: D401 - # update the logger with event info - global logger - logger = crhelper.log_config(event, loglevel="info") - return crhelper.cfn_handler(event, context, create, update, delete, logger, init_failed) diff --git a/cloudformation/batch-substack.cfn.json b/cloudformation/batch-substack.cfn.json index 694d1dd74d..b7cbd97211 100644 --- a/cloudformation/batch-substack.cfn.json +++ b/cloudformation/batch-substack.cfn.json @@ -238,7 +238,6 @@ "Effect": "Allow", "Action": [ "batch:SubmitJob", - "batch:RegisterJobDefinition", "cloudformation:DescribeStacks", "ecs:ListContainerInstances", "ecs:DescribeContainerInstances", @@ -254,24 +253,7 @@ "Fn::Sub": "${JobDefinitionSerial}" }, { - "Fn::Sub": [ - "${MNPJobDefinitionArn}*", - { - "MNPJobDefinitionArn": { - "Fn::Select": [ - "0", - { - "Fn::Split": [ - ":1END_OF_THE_STRING", - { - "Fn::Sub": "${JobDefinitionMNP}END_OF_THE_STRING" - } - ] - } - ] - } - } - ] + "Fn::Sub": "${JobDefinitionMNP}" }, { "Fn::Sub": "${JobQueue}" @@ -1010,101 +992,6 @@ } ] } - }, - "DeregisterBatchMNPJobDefinitionsCustomResource": { - "Type": "AWS::CloudFormation::CustomResource", - "Properties": { - "JobDefinitionMNPArn": { - "Ref": "JobDefinitionMNP" - }, - "ServiceToken": { - "Fn::GetAtt": [ - "DeregisterBatchMNPJobDefinitionsFunction", - "Arn" - ] - } - } - }, - "DeregisterBatchMNPJobDefinitionsFunction": { - "Type": "AWS::Lambda::Function", - "Properties": { - "Code": { - "S3Bucket": { - "Ref": "ResourcesS3Bucket" - }, - "S3Key": "custom_resources_code/artifacts.zip" - }, - "Handler": "deregister_batch_mnp_job_definitions.handler", - "MemorySize": 128, - "Role": { - "Fn::GetAtt": [ - "DeregisterBatchMNPJobDefinitionsFunctionExecutionRole", - "Arn" - ] - }, - "Runtime": "python3.6", - "Timeout": 120 - } - }, - "DeregisterBatchMNPJobDefinitionsFunctionExecutionRole": { - "Type": "AWS::IAM::Role", - "Properties": { - "AssumeRolePolicyDocument": { - "Statement": [ - { - "Action": [ - "sts:AssumeRole" - ], - "Effect": "Allow", - "Principal": { - "Service": [ - { - "Fn::Sub": "lambda.${S3Url}" - } - ] - } - } - ], - "Version": "2012-10-17" - }, - "Path": "/", - "Policies": [ - { - "PolicyDocument": { - "Statement": [ - { - "Action": [ - "logs:CreateLogStream", - "logs:PutLogEvents" - ], - "Effect": "Allow", - "Resource": "arn:aws:logs:*:*:*", - "Sid": "CloudWatchLogsPolicy" - }, - { - "Action": [ - "batch:*" - ], - "Effect": "Allow", - "Resource": "*", - "Sid": "Batch" - } - ], - "Version": "2012-10-17" - }, - "PolicyName": "LambdaPolicy" - } - ] - } - }, - "DeregisterBatchMNPJobDefinitionsFunctionLogGroup": { - "Type": "AWS::Logs::LogGroup", - "Properties": { - "LogGroupName": { - "Fn::Sub": "/aws/lambda/${DeregisterBatchMNPJobDefinitionsFunction}" - }, - "RetentionInDays": 1 - } } }, "Outputs": { From 1426552d86a37f393866e89e20f386f2f62e257c Mon Sep 17 00:00:00 2001 From: 
Francesco De Martino Date: Wed, 15 May 2019 10:50:06 +0200 Subject: [PATCH 072/121] tox: add seed-isort-config to isort configuration Signed-off-by: Francesco De Martino --- cli/tox.ini | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cli/tox.ini b/cli/tox.ini index 565059f727..af4a7efa4a 100644 --- a/cli/tox.ini +++ b/cli/tox.ini @@ -9,7 +9,6 @@ envlist = # Default testenv. Used to run tests on all python versions. [testenv] -# deps = -rtests/requirements.txt TODO: ADD UNIT TESTS whitelist_externals = bash deps = -rtests/requirements.txt @@ -61,7 +60,9 @@ commands = [testenv:isort] basepython = python3 skip_install = true -deps = isort +deps = + isort + seed-isort-config commands = isort -rc -w 120 \ {[vars]code_dirs} \ From dc390b176396307b1bd21e013270c608b3134d37 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 15 May 2019 10:51:24 +0200 Subject: [PATCH 073/121] integ tests - scaling: check there are no errors in logs Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/test_scaling.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index 02df37a49d..7df276d97d 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -60,6 +60,9 @@ def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clu expected_compute_nodes=(0, 3), ) + logging.info("Verifying no error in logs") + _assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) + @pytest.mark.regions(["sa-east-1"]) @pytest.mark.instances(["c5.xlarge"]) @@ -153,3 +156,11 @@ def _assert_test_jobs_completed(remote_command_executor, max_jobs_exec_time): jobs_execution_time = jobs_completion_time - jobs_start_time logging.info("Test jobs completed in %d seconds", jobs_execution_time) assert_that(jobs_execution_time).is_less_than(max_jobs_exec_time) + + +def _assert_no_errors_in_logs(remote_command_executor, log_files): + __tracebackhide__ = True + for log_file in log_files: + log = remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout + for error_level in ["CRITICAL", "ERROR"]: + assert_that(log).does_not_contain(error_level) From d17e561db49725d4d944f3ba8acd85d5b39c630d Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Wed, 8 May 2019 15:22:04 +0200 Subject: [PATCH 074/121] Align placeholder name to configuration parameter name Signed-off-by: Enrico Usai --- .../integration-tests/tests/update/test_update.py | 14 +++++++------- .../test_update/test_update/pcluster.config.ini | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index 36d5a7c2a3..401f069a59 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -21,7 +21,7 @@ from time_utils import minutes PClusterConfig = namedtuple( - "PClusterConfig", ["max_queue_size", "compute_instance", "s3_read_resource", "s3_read_write_resource"] + "PClusterConfig", ["max_queue_size", "compute_instance_type", "s3_read_resource", "s3_read_write_resource"] ) @@ -35,14 +35,14 @@ def test_update(instance, region, pcluster_config_reader, clusters_factory): """ s3_arn = "arn:aws:s3:::fake_bucket/*" init_config = PClusterConfig( - max_queue_size=5, compute_instance=instance, s3_read_resource=s3_arn, 
s3_read_write_resource=s3_arn + max_queue_size=5, compute_instance_type=instance, s3_read_resource=s3_arn, s3_read_write_resource=s3_arn ) cluster = _init_cluster(region, clusters_factory, pcluster_config_reader, init_config) s3_arn_updated = "arn:aws:s3:::fake_bucket/fake_folder/*" updated_config = PClusterConfig( max_queue_size=10, - compute_instance="c4.xlarge", + compute_instance_type="c4.xlarge", s3_read_resource=s3_arn_updated, s3_read_write_resource=s3_arn_updated, ) @@ -50,7 +50,7 @@ def test_update(instance, region, pcluster_config_reader, clusters_factory): # test update _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size) - _test_update_compute_instance_type(region, cluster, updated_config.compute_instance) + _test_update_compute_instance_type(region, cluster, updated_config.compute_instance_type) _test_s3_read_resource(region, cluster, updated_config.s3_read_resource) _test_s3_read_write_resource(region, cluster, updated_config.s3_read_write_resource) @@ -59,7 +59,7 @@ def _init_cluster(region, clusters_factory, pcluster_config_reader, config): # read configuration and create cluster cluster_config = pcluster_config_reader( max_queue_size=config.max_queue_size, - compute_instance=config.compute_instance, + compute_instance_type=config.compute_instance_type, s3_read_resource=config.s3_read_resource, s3_read_write_resource=config.s3_read_write_resource, ) @@ -67,7 +67,7 @@ def _init_cluster(region, clusters_factory, pcluster_config_reader, config): # Verify initial settings _test_max_queue(region, cluster.cfn_name, config.max_queue_size) - _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance) + _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance_type) _test_s3_read_resource(region, cluster, config.s3_read_resource) _test_s3_read_write_resource(region, cluster, config.s3_read_write_resource) @@ -77,7 +77,7 @@ def _init_cluster(region, clusters_factory, pcluster_config_reader, config): def _update_cluster(cluster, config): # change cluster.config settings _update_cluster_property(cluster, "max_queue_size", str(config.max_queue_size)) - _update_cluster_property(cluster, "compute_instance_type", config.compute_instance) + _update_cluster_property(cluster, "compute_instance_type", config.compute_instance_type) _update_cluster_property(cluster, "s3_read_resource", config.s3_read_resource) _update_cluster_property(cluster, "s3_read_write_resource", config.s3_read_write_resource) # rewrite configuration file starting from the updated cluster.config object diff --git a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini index 64c827c385..9b4856870e 100644 --- a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini +++ b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini @@ -10,7 +10,7 @@ key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} master_instance_type = {{ instance }} -compute_instance_type = {{ compute_instance }} +compute_instance_type = {{ compute_instance_type }} initial_queue_size = 1 max_queue_size = {{ max_queue_size }} maintain_initial_size = true From 76b8afc2c0bc902f2bafc095589968a0fe8d22b8 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 16 May 2019 09:36:34 +0200 Subject: [PATCH 075/121] Integ tests: add functionality to retrieve compute nodes list Signed-off-by: Enrico Usai --- 
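
Illustrative usage (host name assumed): with host="ip-10-0-1-10", the generated
submission commands become

    sbatch -N 1 --wrap='sleep 60' --nodelist=ip-10-0-1-10    # submit_command
    sbatch --nodelist=ip-10-0-1-10 -N 1 job.sh               # submit_script

matching the flag order built by submit_command and submit_script below.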
.../tests/common/schedulers_common.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index b58a77387c..e3a72fb5fb 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -130,6 +130,9 @@ def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 def compute_nodes_count(self): # noqa: D102 raise NotImplementedError + def get_compute_nodes(self): # noqa: D102 + raise NotImplementedError + class SgeCommands(SchedulerCommands): """Implement commands for sge scheduler.""" @@ -171,6 +174,9 @@ def compute_nodes_count(self): # noqa: D102 # split()[-1] to extract last line and trim whitespaces return int(result.stdout.split()[-1]) + def get_compute_nodes(self): # noqa: D102 + raise NotImplementedError + class SlurmCommands(SchedulerCommands): """Implement commands for slurm scheduler.""" @@ -213,6 +219,12 @@ def compute_nodes_count(self): # noqa: D102 # split()[-1] to extract last line and trim whitespaces return int(result.stdout.split()[-1]) + def get_compute_nodes(self): # noqa: D102 + result = self._remote_command_executor.run_remote_command( + "sinfo --Node --noheader | grep compute | awk '{print $1}'" + ) + return result.stdout.splitlines() + class TorqueCommands(SchedulerCommands): """Implement commands for torque scheduler.""" @@ -245,6 +257,9 @@ def compute_nodes_count(self): # noqa: D102 # split()[-1] to extract last line and trim whitespaces return int(result.stdout.split()[-1]) + def get_compute_nodes(self): # noqa: D102 + raise NotImplementedError + def get_scheduler_commands(scheduler, remote_command_executor): scheduler_commands = { From 84fe4b8ec739497bb5cc492124ed1c198f00c306 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 16 May 2019 09:37:14 +0200 Subject: [PATCH 076/121] Integ tests: add functionality to submit a job in a specific node Signed-off-by: Enrico Usai --- .../tests/common/schedulers_common.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index e3a72fb5fb..64d577c2fb 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -201,14 +201,19 @@ def assert_job_submitted(self, sbatch_output): # noqa: D102 assert_that(match).is_not_none() return match.group(1) - def submit_command(self, command, nodes=1): # noqa: D102 - return self._remote_command_executor.run_remote_command("sbatch -N {0} --wrap='{1}'".format(nodes, command)) + def submit_command(self, command, nodes=1, host=None): # noqa: D102 + submission_command = "sbatch -N {0} --wrap='{1}'".format(nodes, command) + if host: + submission_command += " --nodelist={0}".format(host) + return self._remote_command_executor.run_remote_command(submission_command) - def submit_script(self, script, nodes=1): # noqa: D102 + def submit_script(self, script, nodes=1, host=None): # noqa: D102 script_name = os.path.basename(script) - return self._remote_command_executor.run_remote_command( - "sbatch -N {0} {1}".format(nodes, script_name), additional_files=[script] - ) + submission_command = "sbatch" + if host: + submission_command += " --nodelist={0}".format(host) + submission_command += " -N {0} {1}".format(nodes, script_name) + return 
self._remote_command_executor.run_remote_command(submission_command, additional_files=[script])
 
     def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
         result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))

From 7ba4a79b4948e32960847840fb16363ea567236d Mon Sep 17 00:00:00 2001
From: Enrico Usai
Date: Thu, 16 May 2019 10:10:45 +0200
Subject: [PATCH 077/121] Integ tests: add compute_root_volume_size test

+ create a function to add new compute nodes, because some changes are
only available on new nodes

Signed-off-by: Enrico Usai
---
 .../tests/update/test_update.py               | 77 +++++++++++++++----
 .../test_update/pcluster.config.ini           |  1 +
 2 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py
index 401f069a59..8c56a6dc7e 100644
--- a/tests/integration-tests/tests/update/test_update.py
+++ b/tests/integration-tests/tests/update/test_update.py
@@ -21,13 +21,19 @@
 from time_utils import minutes
 
 PClusterConfig = namedtuple(
-    "PClusterConfig", ["max_queue_size", "compute_instance_type", "s3_read_resource", "s3_read_write_resource"]
+    "PClusterConfig", [
+        "max_queue_size",
+        "compute_instance_type",
+        "compute_root_volume_size",
+        "s3_read_resource",
+        "s3_read_write_resource"
+    ]
 )
 
 
 @pytest.mark.dimensions("eu-west-1", "c5.xlarge", "alinux", "slurm")
 @pytest.mark.usefixtures("os", "scheduler")
-def test_update(instance, region, pcluster_config_reader, clusters_factory):
+def test_update(instance, region, pcluster_config_reader, clusters_factory, test_datadir):
     """
     Test 'pcluster update' command.
 
@@ -35,31 +41,41 @@
     """
     s3_arn = "arn:aws:s3:::fake_bucket/*"
     init_config = PClusterConfig(
-        max_queue_size=5, compute_instance_type=instance, s3_read_resource=s3_arn, s3_read_write_resource=s3_arn
+        max_queue_size=5,
+        compute_instance_type=instance,
+        compute_root_volume_size=30,
+        s3_read_resource=s3_arn,
+        s3_read_write_resource=s3_arn
     )
-    cluster = _init_cluster(region, clusters_factory, pcluster_config_reader, init_config)
+    cluster = _init_cluster(region, test_datadir, clusters_factory, pcluster_config_reader, init_config)
 
     s3_arn_updated = "arn:aws:s3:::fake_bucket/fake_folder/*"
     updated_config = PClusterConfig(
         max_queue_size=10,
         compute_instance_type="c4.xlarge",
+        compute_root_volume_size=40,
         s3_read_resource=s3_arn_updated,
         s3_read_write_resource=s3_arn_updated,
     )
     _update_cluster(cluster, updated_config)
 
-    # test update
+    # verify updated parameters
     _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size)
-    _test_update_compute_instance_type(region, cluster, updated_config.compute_instance_type)
     _test_s3_read_resource(region, cluster, updated_config.s3_read_resource)
     _test_s3_read_write_resource(region, cluster, updated_config.s3_read_write_resource)
 
+    # add compute nodes and verify compute node updated parameters
+    new_compute_nodes = _add_compute_nodes(cluster)
+    _test_compute_instance_type(region, cluster.cfn_name, updated_config.compute_instance_type)
+    _test_compute_root_volume_size(test_datadir, cluster, updated_config.compute_root_volume_size, new_compute_nodes[0])
 
 
-def _init_cluster(region, clusters_factory, pcluster_config_reader, config):
+def _init_cluster(region, test_datadir, clusters_factory, pcluster_config_reader, config):
     # read configuration and create cluster
     cluster_config = pcluster_config_reader(
max_queue_size=config.max_queue_size, compute_instance_type=config.compute_instance_type, + compute_root_volume_size=config.compute_root_volume_size, s3_read_resource=config.s3_read_resource, s3_read_write_resource=config.s3_read_write_resource, ) @@ -67,10 +83,13 @@ def _init_cluster(region, clusters_factory, pcluster_config_reader, config): # Verify initial settings _test_max_queue(region, cluster.cfn_name, config.max_queue_size) - _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance_type) _test_s3_read_resource(region, cluster, config.s3_read_resource) _test_s3_read_write_resource(region, cluster, config.s3_read_write_resource) + # Verify compute node initial settings + _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance_type) + _test_compute_root_volume_size(test_datadir, cluster, config.compute_root_volume_size) + return cluster @@ -78,6 +97,7 @@ def _update_cluster(cluster, config): # change cluster.config settings _update_cluster_property(cluster, "max_queue_size", str(config.max_queue_size)) _update_cluster_property(cluster, "compute_instance_type", config.compute_instance_type) + _update_cluster_property(cluster, "compute_root_volume_size", str(config.compute_root_volume_size)) _update_cluster_property(cluster, "s3_read_resource", config.s3_read_resource) _update_cluster_property(cluster, "s3_read_write_resource", config.s3_read_write_resource) # rewrite configuration file starting from the updated cluster.config object @@ -96,12 +116,22 @@ def _test_max_queue(region, stack_name, queue_size): assert_that(asg_max_size).is_equal_to(queue_size) -def _test_update_compute_instance_type(region, cluster, new_compute_instance): - # submit a job to perform a scaling up action and have a new instance - number_of_nodes = 2 +def _add_compute_nodes(cluster, number_of_nodes=1): + """ + Add new compute nodes to the cluster. + + It is required because some changes will be available only on new compute nodes. 
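+ (For example, an updated compute_root_volume_size applies only to compute instances launched after the update.)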
+ :param cluster: the cluster + :param number_of_nodes: number of nodes to add + :return an array containing the new compute nodes only + """ remote_command_executor = RemoteCommandExecutor(cluster) slurm_commands = SlurmCommands(remote_command_executor) - result = slurm_commands.submit_command("sleep 60", nodes=number_of_nodes) + initial_compute_nodes = slurm_commands.get_compute_nodes() + + number_of_nodes = len(initial_compute_nodes) + number_of_nodes + # submit a job to perform a scaling up action and have new instances + result = slurm_commands.submit_command("sleep 1", nodes=number_of_nodes) slurm_commands.assert_job_submitted(result.stdout) estimated_scaleup_time = 5 @@ -110,7 +140,8 @@ def _test_update_compute_instance_type(region, cluster, new_compute_instance): max_monitoring_time=minutes(estimated_scaleup_time), number_of_nodes=number_of_nodes, ) - _test_compute_instance_type(region, cluster.cfn_name, new_compute_instance) + + return [node for node in slurm_commands.get_compute_nodes() if node not in initial_compute_nodes] def _test_compute_instance_type(region, stack_name, compute_instance_type): @@ -122,6 +153,26 @@ def _test_compute_instance_type(region, stack_name, compute_instance_type): assert_that(instance_types).contains(compute_instance_type) +def _test_compute_root_volume_size(test_datadir, cluster, compute_root_volume_size, host=None): + remote_command_executor = RemoteCommandExecutor(cluster) + slurm_commands = SlurmCommands(remote_command_executor) + if host: + compute_node = host + else: + nodes = slurm_commands.get_compute_nodes() + compute_node = nodes[0] if len(nodes) > 0 else None + + # submit a job to retrieve compute root volume size and save in a file + result = slurm_commands.submit_script(str(test_datadir / "slurm_get_root_volume_size.sh"), host=compute_node) + job_id = slurm_commands.assert_job_submitted(result.stdout) + slurm_commands.wait_job_completed(job_id) + slurm_commands.assert_job_succeeded(job_id) + + # read volume size from file + result = remote_command_executor.run_remote_command("cat /shared/{0}_root_volume_size.txt".format(compute_node)) + assert_that(result.stdout).matches(r"{size}G".format(size=compute_root_volume_size)) + + def _test_policy_statement(region, cluster, policy_name, policy_statement): iam_client = boto3.client("iam", region_name=region) root_role = cluster.cfn_resources.get("RootRole") diff --git a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini index 9b4856870e..dd4b2d9b49 100644 --- a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini +++ b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini @@ -11,6 +11,7 @@ vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} master_instance_type = {{ instance }} compute_instance_type = {{ compute_instance_type }} +compute_root_volume_size = {{ compute_root_volume_size }} initial_queue_size = 1 max_queue_size = {{ max_queue_size }} maintain_initial_size = true From 0d70296184b04d11b1725bd2d8dda26dc4d4ff65 Mon Sep 17 00:00:00 2001 From: Enrico Usai Date: Thu, 16 May 2019 11:08:39 +0200 Subject: [PATCH 078/121] Integ tests: verify updated params on both OLD and NEW compute nodes + Improve test_compute_instance_type to check a specific node Signed-off-by: Enrico Usai --- .../tests/update/test_update.py | 77 +++++++++++-------- .../test_update/slurm_get_root_volume_size.sh | 13 ++++ 2 files changed, 59 
insertions(+), 31 deletions(-) create mode 100644 tests/integration-tests/tests/update/test_update/test_update/slurm_get_root_volume_size.sh diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index 8c56a6dc7e..f44b648c7e 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -9,6 +9,7 @@ # or in the "LICENSE.txt" file accompanying this file. # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. +import time from collections import namedtuple import boto3 @@ -21,13 +22,14 @@ from time_utils import minutes PClusterConfig = namedtuple( - "PClusterConfig", [ + "PClusterConfig", + [ "max_queue_size", "compute_instance_type", "compute_root_volume_size", "s3_read_resource", - "s3_read_write_resource" - ] + "s3_read_write_resource", + ], ) @@ -45,9 +47,13 @@ def test_update(instance, region, pcluster_config_reader, clusters_factory, test compute_instance_type=instance, compute_root_volume_size=30, s3_read_resource=s3_arn, - s3_read_write_resource=s3_arn + s3_read_write_resource=s3_arn, ) - cluster = _init_cluster(region, test_datadir, clusters_factory, pcluster_config_reader, init_config) + cluster = _init_cluster(clusters_factory, pcluster_config_reader, init_config) + command_executor = RemoteCommandExecutor(cluster) + slurm_commands = SlurmCommands(command_executor) + + _verify_initialization(command_executor, slurm_commands, region, test_datadir, cluster, init_config) s3_arn_updated = "arn:aws:s3:::fake_bucket/fake_folder/*" updated_config = PClusterConfig( @@ -64,13 +70,21 @@ def test_update(instance, region, pcluster_config_reader, clusters_factory, test _test_s3_read_resource(region, cluster, updated_config.s3_read_resource) _test_s3_read_write_resource(region, cluster, updated_config.s3_read_write_resource) - # add compute nodes and verify compute node updated parameters - new_compute_nodes = _add_compute_nodes(cluster) - _test_compute_instance_type(region, cluster.cfn_name, updated_config.compute_instance_type) - _test_compute_root_volume_size(test_datadir, cluster, updated_config.compute_root_volume_size, new_compute_nodes[0]) + # verify params that are NOT updated in OLD compute nodes + compute_nodes = slurm_commands.get_compute_nodes() + _test_compute_instance_type(region, cluster.cfn_name, init_config.compute_instance_type, compute_nodes[0]) + _test_compute_root_volume_size( + command_executor, slurm_commands, test_datadir, init_config.compute_root_volume_size, compute_nodes[0] + ) + # add compute nodes and verify updated params in NEW compute nodes + new_compute_nodes = _add_compute_nodes(slurm_commands) + _test_compute_instance_type(region, cluster.cfn_name, updated_config.compute_instance_type, new_compute_nodes[0]) + _test_compute_root_volume_size( + command_executor, slurm_commands, test_datadir, updated_config.compute_root_volume_size, new_compute_nodes[0] + ) -def _init_cluster(region, test_datadir, clusters_factory, pcluster_config_reader, config): +def _init_cluster(clusters_factory, pcluster_config_reader, config): # read configuration and create cluster cluster_config = pcluster_config_reader( max_queue_size=config.max_queue_size, @@ -80,17 +94,21 @@ def _init_cluster(region, test_datadir, clusters_factory, pcluster_config_reader 
s3_read_write_resource=config.s3_read_write_resource, ) cluster = clusters_factory(cluster_config) + return cluster + +def _verify_initialization(command_executor, slurm_commands, region, test_datadir, cluster, config): # Verify initial settings _test_max_queue(region, cluster.cfn_name, config.max_queue_size) _test_s3_read_resource(region, cluster, config.s3_read_resource) _test_s3_read_write_resource(region, cluster, config.s3_read_write_resource) - # Verify compute node initial settings - _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance_type) - _test_compute_root_volume_size(test_datadir, cluster, config.compute_root_volume_size) - - return cluster + # Verify Compute nodes initial settings + compute_nodes = slurm_commands.get_compute_nodes() + _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance_type, compute_nodes[0]) + _test_compute_root_volume_size( + command_executor, slurm_commands, test_datadir, config.compute_root_volume_size, compute_nodes[0] + ) def _update_cluster(cluster, config): @@ -116,7 +134,7 @@ def _test_max_queue(region, stack_name, queue_size): assert_that(asg_max_size).is_equal_to(queue_size) -def _add_compute_nodes(cluster, number_of_nodes=1): +def _add_compute_nodes(slurm_commands, number_of_nodes=1): """ Add new compute nodes to the cluster. @@ -125,8 +143,6 @@ def _add_compute_nodes(cluster, number_of_nodes=1): :param number_of_nodes: number of nodes to add :return an array containing the new compute nodes only """ - remote_command_executor = RemoteCommandExecutor(cluster) - slurm_commands = SlurmCommands(remote_command_executor) initial_compute_nodes = slurm_commands.get_compute_nodes() number_of_nodes = len(initial_compute_nodes) + number_of_nodes @@ -144,32 +160,31 @@ def _add_compute_nodes(cluster, number_of_nodes=1): return [node for node in slurm_commands.get_compute_nodes() if node not in initial_compute_nodes] -def _test_compute_instance_type(region, stack_name, compute_instance_type): +def _test_compute_instance_type(region, stack_name, compute_instance_type, host): + hostname = "{0}.{1}.compute.internal".format(host, region) ec2_resource = boto3.resource("ec2", region_name=region) instance_types = [] - for instance in ec2_resource.instances.filter(Filters=[{"Name": "tag:Application", "Values": [stack_name]}]): + for instance in ec2_resource.instances.filter( + Filters=[ + {"Name": "tag:Application", "Values": [stack_name]}, + {"Name": "private-dns-name", "Values": [hostname]}, + ] + ): instance_types.append(instance.instance_type) assert_that(instance_types).contains(compute_instance_type) -def _test_compute_root_volume_size(test_datadir, cluster, compute_root_volume_size, host=None): - remote_command_executor = RemoteCommandExecutor(cluster) - slurm_commands = SlurmCommands(remote_command_executor) - if host: - compute_node = host - else: - nodes = slurm_commands.get_compute_nodes() - compute_node = nodes[0] if len(nodes) > 0 else None - +def _test_compute_root_volume_size(command_executor, slurm_commands, test_datadir, compute_root_volume_size, host): # submit a job to retrieve compute root volume size and save in a file - result = slurm_commands.submit_script(str(test_datadir / "slurm_get_root_volume_size.sh"), host=compute_node) + result = slurm_commands.submit_script(str(test_datadir / "slurm_get_root_volume_size.sh"), host=host) job_id = slurm_commands.assert_job_submitted(result.stdout) slurm_commands.wait_job_completed(job_id) slurm_commands.assert_job_succeeded(job_id) # read volume size from 
file - result = remote_command_executor.run_remote_command("cat /shared/{0}_root_volume_size.txt".format(compute_node)) + time.sleep(5) # wait a bit to be sure to have the file + result = command_executor.run_remote_command("cat /shared/{0}_root_volume_size.txt".format(host)) assert_that(result.stdout).matches(r"{size}G".format(size=compute_root_volume_size)) diff --git a/tests/integration-tests/tests/update/test_update/test_update/slurm_get_root_volume_size.sh b/tests/integration-tests/tests/update/test_update/test_update/slurm_get_root_volume_size.sh new file mode 100644 index 0000000000..225dca8feb --- /dev/null +++ b/tests/integration-tests/tests/update/test_update/test_update/slurm_get_root_volume_size.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +df -h | grep '/$' | awk '{{print $2}}' > /shared/$(hostname)_root_volume_size.txt From 946d6ced2e3b34852d6a41067f7af88e13a97029 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Mon, 20 May 2019 15:06:18 +0200 Subject: [PATCH 079/121] Avoid c4.xlarge to be launched in ap-northeast-2b (not supported) Signed-off-by: Luca Carrogu --- tests/integration-tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 3c16fbd50c..d6cdea0d60 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -284,6 +284,8 @@ def cfn_stacks_factory(): "us-west-2": ["us-west-2a", "us-west-2b", "us-west-2c"], # c5.xlarge is not supported in ap-southeast-2a "ap-southeast-2": ["ap-southeast-2b", "ap-southeast-2c"], + # c4.xlarge is not supported in ap-northeast-2b + "ap-northeast-2": ["ap-northeast-2a", "ap-northeast-2c"], } From 753bb68f3facbabe8a07434f1bbaed58675ed3ad Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 21 May 2019 11:39:28 +0200 Subject: [PATCH 080/121] integ tests: blacklist ap-southeast-1c due to c5.xlarge unavailability Signed-off-by: Francesco De Martino --- tests/integration-tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index d6cdea0d60..f144cc9871 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -286,6 +286,8 @@ def cfn_stacks_factory(): "ap-southeast-2": ["ap-southeast-2b", "ap-southeast-2c"], # c4.xlarge is not supported in ap-northeast-2b "ap-northeast-2": ["ap-northeast-2a", "ap-northeast-2c"], + # c5.xlarge is not supported in ap-southeast-1c + "ap-southeast-1": ["ap-southeast-1a", "ap-southeast-1b"], } From 5249659d7fb10588acb6d31d3e71a0a13ac2d2ba Mon Sep 17 00:00:00 2001 From: Matteo Fiordarancio Date: Thu, 16 May 2019 16:44:08 +0200 Subject: [PATCH 081/121] doc: update network configuration for compute_subnet_cidr case I verified with a test that for compute_subnet_cidr no Nat is needed --- docs/networking.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/networking.rst 
b/docs/networking.rst index 3c1e0f3274..d0863f7ad5 100644 --- a/docs/networking.rst +++ b/docs/networking.rst @@ -47,6 +47,7 @@ The configuration to create a new private subnet for compute instances requires vpc_id = vpc-xxxxxx master_subnet_id = subnet- compute_subnet_cidr = 10.0.1.0/24 + use_public_ips = true The configuration to use an existing private network requires the following settings: @@ -57,9 +58,10 @@ The configuration to use an existing private network requires the following sett master_subnet_id = subnet- compute_subnet_id = subnet- -Both these configuration require to have a `NAT Gateway -`_ -or an internal PROXY to enable web access for compute instances. +.. note:: + This second configuration requires a `NAT Gateway + `_ + or an internal PROXY to enable web access for compute instances. AWS ParallelCluster in a single private subnet connected using Direct Connect ----------------------------------------------------------------------------- From da5ba90e51ef029061821b9058e3cdbfef72f607 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 22 May 2019 11:02:31 +0200 Subject: [PATCH 082/121] integ tests: blacklist ap-south-1c due to c4.xlarge unavailability Signed-off-by: Francesco De Martino --- tests/integration-tests/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index f144cc9871..8cb2908980 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -288,6 +288,8 @@ def cfn_stacks_factory(): "ap-northeast-2": ["ap-northeast-2a", "ap-northeast-2c"], # c5.xlarge is not supported in ap-southeast-1c "ap-southeast-1": ["ap-southeast-1a", "ap-southeast-1b"], + # c4.xlarge is not supported in ap-south-1c + "ap-south-1": ["ap-south-1a", "ap-south-1b"], } From 69d0ef25c4554a8d393d7a6e73deb8b1bb0116d7 Mon Sep 17 00:00:00 2001 From: Matteo Fiordarancio Date: Fri, 17 May 2019 13:15:29 +0200 Subject: [PATCH 083/121] update sanity check to solve cidr_value and public_ips conflicts I verified with a test that for compute_subnet_cidr public_ips has to be set to true, so I added the check into the code. --- cli/pcluster/cfnconfig.py | 5 +++++ cli/pcluster/config_sanity.py | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index c5f43dda54..7b2e82fb9a 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -354,6 +354,11 @@ def __init_vpc_parameters(self): "VPC section [%s] used in [%s] section is not defined" % (vpc_section, self.__cluster_section) ) + # Check that cidr and public ips are not both set + cidr_value = self.__config.get(vpc_section, "compute_subnet_cidr", fallback=None) + public_ips = self.__config.getboolean(vpc_section, "use_public_ips", fallback=True) + ResourceValidator.validate_vpc_coherence(cidr_value, public_ips) + def __check_account_capacity(self): """Try to launch the requested number of instances to verify Account limits.""" if self.parameters.get("Scheduler") == "awsbatch" or self.parameters.get("ClusterType", "ondemand") == "spot": diff --git a/cli/pcluster/config_sanity.py b/cli/pcluster/config_sanity.py index 94827927bd..f31f6f8db9 100644 --- a/cli/pcluster/config_sanity.py +++ b/cli/pcluster/config_sanity.py @@ -46,6 +46,17 @@ def __get_partition(self): return "aws-us-gov" return "aws" + @staticmethod + def validate_vpc_coherence(cidr_value, public_ip): + """ + Check that cidr_value and public_ip parameters are not conflicting.
+ + :param cidr_value: the value of compute_subnet_cidr set by the user (default should be None) + :param public_ip: the value of use_public_ips set by the user (default should be True) + """ + if cidr_value and public_ip is False: + ResourceValidator.__fail("VPC COHERENCE", "compute_subnet_cidr needs use_public_ips to be true") + @staticmethod def __check_sg_rules_for_port(rule, port_to_check): """ From b0f1771b0bd195202a13c176e7485a627f28d0ab Mon Sep 17 00:00:00 2001 From: Matteo Fiordarancio Date: Fri, 17 May 2019 15:42:42 +0200 Subject: [PATCH 084/121] doc: updated configuration doc for use_public_ip with a note on cidr --- docs/configuration.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 4b3a7127e3..3c9c61d0c4 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -593,6 +593,9 @@ If true, an Elastic IP will be associated to the Master instance. If false, the Master instance will have a Public IP (or not) according to the value of the "Auto-assign Public IP" subnet configuration parameter. +.. note:: + This parameter can't be set to false if :code:`compute_subnet_cidr` is specified. + See :ref:`networking configuration ` for some examples. Defaults to true. :: From c68ff0e70fbae584d6ba04d6f5be2fd0cb5fe22d Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Thu, 23 May 2019 13:12:10 +0200 Subject: [PATCH 085/121] Change download URL for CloudFormation Helper Scripts package This will add support for Ubuntu in China NorthWest region (cn-northwest-1) Signed-off-by: Luca Carrogu --- CHANGELOG.rst | 7 +++++++ cloudformation/aws-parallelcluster.cfn.json | 6 +++--- docs/configuration.rst | 6 ++---- tests/integration-tests/conftest_markers.py | 2 -- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 15f0fad6e7..f8b2677231 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,13 @@ CHANGELOG ========= +2.4.0 +===== + +**ENHANCEMENTS** + +* Add support for Ubuntu in China region `cn-northwest-1` + 2.3.1 ===== diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index b037c167ab..8b7d460fe7 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -2215,7 +2215,7 @@ " if [ \"${apt}\" == \"0\" ]; then\n", " apt-cache search build-essential; apt-get clean; apt-get update; apt-get -y install build-essential curl wget jq\n", " fi\n", - " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn\"", + " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", " which cfn-init 2>/dev/null || ( curl -s -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://s3.${s3_url}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz; easy_install -U /tmp/aws-cfn-bootstrap-latest.tar.gz)\n", " mkdir -p /etc/chef && chown -R root:root /etc/chef\n", " curl --retry 3 -L https://www.chef.io/chef/install.sh | bash -s -- -v ${chef_version}\n", @@ -3053,7 +3053,7 @@ " if [ \"${apt}\" == \"0\" ]; then\n", " apt-cache search build-essential; apt-get clean; apt-get update; apt-get -y install build-essential curl wget jq\n", " fi\n", - " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn\"", + " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", " which cfn-init 2>/dev/null || ( curl -s -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz 
https://s3.${s3_url}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz; easy_install -U /tmp/aws-cfn-bootstrap-latest.tar.gz)\n", " mkdir -p /etc/chef && chown -R root:root /etc/chef\n", " curl --retry 3 -L https://www.chef.io/chef/install.sh | bash -s -- -v ${chef_version}\n", @@ -3724,7 +3724,7 @@ " if [ \"${apt}\" == \"0\" ]; then\n", " apt-cache search build-essential; apt-get clean; apt-get update; apt-get -y install build-essential curl wget jq\n", " fi\n", - " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn\"", + " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", " which cfn-init 2>/dev/null || ( curl -s -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://s3.${s3_url}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz; easy_install -U /tmp/aws-cfn-bootstrap-latest.tar.gz)\n", " mkdir -p /etc/chef && chown -R root:root /etc/chef\n", " curl --retry 3 -L https://www.chef.io/chef/install.sh | bash -s -- -v ${chef_version}\n", diff --git a/docs/configuration.rst b/docs/configuration.rst index 3c9c61d0c4..e9adafe6cc 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -419,10 +419,8 @@ that commercial entails all supported regions including us-east-1, us-west-2, et region alinux centos6 centos7 ubuntu1404 ubuntu1604 ============== ====== ============ ============ ============= ============ commercial True True True True True - us-gov-west-1 True False False True True - us-gov-east-1 True False False True True - cn-north-1 True False False True True - cn-northwest-1 True False False False False + govcloud True False False True True + china True False False True True ============== ====== ============ ============ ============= ============ Note: The base_os determines the username used to log into the cluster. diff --git a/tests/integration-tests/conftest_markers.py b/tests/integration-tests/conftest_markers.py index 3b1a1f7192..5e7115abba 100644 --- a/tests/integration-tests/conftest_markers.py +++ b/tests/integration-tests/conftest_markers.py @@ -26,8 +26,6 @@ ("us-gov-west-1", "*", "*", "awsbatch"), ("us-gov-east-1", "*", "*", "awsbatch"), ("us-gov-east-1", "*", "c4.xlarge", "*"), - ("cn-northwest-1", "*", "ubuntu1404", "*"), # aws-cfn-bootstrap missing in Ningxia region - ("cn-northwest-1", "*", "ubuntu1604", "*"), # aws-cfn-bootstrap missing in Ningxia region ] From f424830548df59e820b27fc902ca7205eaffebb9 Mon Sep 17 00:00:00 2001 From: Matteo Fiordarancio Date: Fri, 24 May 2019 11:46:47 +0200 Subject: [PATCH 086/121] fixed cfn-lint errors on Types --- cloudformation/ebs-substack.cfn.json | 2 +- cloudformation/efs-substack.cfn.json | 2 +- cloudformation/raid-substack.cfn.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cloudformation/ebs-substack.cfn.json b/cloudformation/ebs-substack.cfn.json index a2db553425..a25089a8ce 100644 --- a/cloudformation/ebs-substack.cfn.json +++ b/cloudformation/ebs-substack.cfn.json @@ -1049,7 +1049,7 @@ "Parameters": { "AvailabilityZone": { "Description": "Availability Zone the cluster will launch into. THIS IS REQUIRED", - "Type": "String" + "Type": "AWS::EC2::AvailabilityZone::Name" }, "EBSEncryption": { "Description": "Boolean flag to use EBS encryption for /shared volume. 
(Not to be used for snapshots)", diff --git a/cloudformation/efs-substack.cfn.json b/cloudformation/efs-substack.cfn.json index d94bd9d5f7..6f9a10b3dd 100644 --- a/cloudformation/efs-substack.cfn.json +++ b/cloudformation/efs-substack.cfn.json @@ -201,7 +201,7 @@ "Parameters": { "ComputeSecurityGroup": { "Description": "SecurityGroup for Mount Target", - "Type": "String" + "Type": "AWS::EC2::SecurityGroup::Id" }, "EFSOptions": { "Description": "Comma separated list of efs related options, 8 parameters in total", diff --git a/cloudformation/raid-substack.cfn.json b/cloudformation/raid-substack.cfn.json index 63159ef4e7..006139de7f 100644 --- a/cloudformation/raid-substack.cfn.json +++ b/cloudformation/raid-substack.cfn.json @@ -641,7 +641,7 @@ "Parameters": { "AvailabilityZone": { "Description": "Availability Zone the cluster will launch into. THIS IS REQUIRED", - "Type": "String" + "Type": "AWS::EC2::AvailabilityZone::Name" }, "RAIDOptions": { "Description": "Comma separated list of RAID related options, 8 parameters in total, [0 shared_dir,1 raid_type,2 num_of_vols,3 vol_type,4 vol_size,5 vol_IOPS,6 encrypted, 7 ebs_kms_key]", From 2cd2f8da3c5f3cf3771cb7528c2084d50ec36f67 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Thu, 23 May 2019 17:55:54 +0200 Subject: [PATCH 087/121] Bump version to 2.4.0 Signed-off-by: Luca Carrogu --- cli/setup.py | 2 +- cloudformation/aws-parallelcluster.cfn.json | 4 ++-- docs/conf.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/setup.py b/cli/setup.py index 02c009e035..857e19ba1e 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -21,7 +21,7 @@ def readme(): return f.read() -VERSION = "2.3.2a1" +VERSION = "2.4.0" REQUIRES = ["boto3>=1.9.54", "future>=0.16.0,<=0.17.1", "tabulate>=0.8.2,<=0.8.3"] if sys.version_info[:2] == (2, 6): diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 8b7d460fe7..7576046939 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1,6 +1,6 @@ { "AWSTemplateFormatVersion": "2010-09-09", - "Description": "AWS ParallelCluster Template. Version: aws-parallelcluster-2.3.2a1", + "Description": "AWS ParallelCluster Template. Version: aws-parallelcluster-2.4.0", "Metadata": { "AWS::CloudFormation::Interface": { "ParameterGroups": [ @@ -1372,7 +1372,7 @@ }, "PackagesVersions": { "default": { - "parallelcluster": "2.3.2a1", + "parallelcluster": "2.4.0", "cookbook": "aws-parallelcluster-cookbook-2.3.2", "chef": "14.2.0", "ridley": "5.1.1", diff --git a/docs/conf.py b/docs/conf.py index 95b1cb3a9e..60cb23c822 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,9 +51,9 @@ # built documents. # # The short X.Y version. -version = '2.3' +version = '2.4' # The full version, including alpha/beta/rc tags. -release = '2.3.2a1' +release = '2.4.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
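The VPC coherence check introduced in PATCH 083 (and gated behind sanity_check in PATCH 089 below) rejects configurations that ask ParallelCluster to create a new compute subnet while disabling public IPs. A minimal sketch of a ``[vpc]`` section the check would reject follows; the section name and the VPC/subnet IDs are placeholders, and the parameter names are those used in the docs diffs above:

```ini
[vpc parallelcluster-vpc]
vpc_id = vpc-xxxxxx
master_subnet_id = subnet-xxxxxx
# compute_subnet_cidr asks pcluster to create a new compute subnet, whose
# instances need public IPs for web access; combining it with
# use_public_ips = false trips the "VPC COHERENCE" sanity-check failure
# (compute_subnet_cidr needs use_public_ips to be true)
compute_subnet_cidr = 10.0.1.0/24
use_public_ips = false
```

To run compute nodes without public IPs, point compute_subnet_id at an existing private subnet (with a NAT Gateway or internal proxy, per the networking doc above) instead of using compute_subnet_cidr.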
From 16e9cff36c2e3fab4fc3cdc4ba52182667c8a31f Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Sat, 25 May 2019 11:30:37 +0200 Subject: [PATCH 088/121] Bump version to 2.4.0 Signed-off-by: Luca Carrogu --- cloudformation/aws-parallelcluster.cfn.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 7576046939..1844c3ff18 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1373,7 +1373,7 @@ "PackagesVersions": { "default": { "parallelcluster": "2.4.0", - "cookbook": "aws-parallelcluster-cookbook-2.3.2", + "cookbook": "aws-parallelcluster-cookbook-2.4.0", "chef": "14.2.0", "ridley": "5.1.1", "berkshelf": "7.0.4", From e8aed9506fc8ae7ae4e6fe00f1222f6c3af62762 Mon Sep 17 00:00:00 2001 From: Matteo Fiordarancio Date: Fri, 24 May 2019 10:49:55 +0200 Subject: [PATCH 089/121] added a check to run validate_vpc_coherence only when the sanity_check param is enabled --- cli/pcluster/cfnconfig.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index 7b2e82fb9a..5ed45bd2d0 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -357,7 +357,8 @@ def __init_vpc_parameters(self): # Check that cidr and public ips are not both set cidr_value = self.__config.get(vpc_section, "compute_subnet_cidr", fallback=None) public_ips = self.__config.getboolean(vpc_section, "use_public_ips", fallback=True) - ResourceValidator.validate_vpc_coherence(cidr_value, public_ips) + if self.__sanity_check: + ResourceValidator.validate_vpc_coherence(cidr_value, public_ips) def __check_account_capacity(self): """Try to launch the requested number of instances to verify Account limits.""" if self.parameters.get("Scheduler") == "awsbatch" or self.parameters.get("ClusterType", "ondemand") == "spot": From e97c859b02342a939f18d110dab88ba9acada6e5 Mon Sep 17 00:00:00 2001 From: Yue Jiang Date: Mon, 27 May 2019 19:53:04 -0700 Subject: [PATCH 090/121] fix util path --- docs/custom_cookbook.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/custom_cookbook.rst b/docs/custom_cookbook.rst index 5b62e77f08..d17968e31f 100644 --- a/docs/custom_cookbook.rst +++ b/docs/custom_cookbook.rst @@ -21,7 +21,7 @@ Steps #. Upload the cookbook, changing ``[your_bucket]`` to a bucket you own :: - $ cd aws-parallelcluster-cookbook/utils + $ cd aws-parallelcluster-cookbook $ /bin/bash util/uploadCookbook.sh --bucket [your_bucket] --srcdir . #.
From the output above, add the following variable to the AWS ParallelCluster config file, under the ``[cluster ...]`` section :: From f10d37b9b6a1d704fbbc4324afa49930acad467e Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 28 May 2019 16:38:14 +0200 Subject: [PATCH 091/121] Integ tests: sge - test nodewatcher termination on failing nodes Signed-off-by: Francesco De Martino --- .../tests/common/schedulers_common.py | 5 ++++- tests/integration-tests/tests/test_scaling.py | 2 +- .../sge_kill_scheduler_job.sh | 13 +++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100755 tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index 64d577c2fb..950d18f13b 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -162,7 +162,10 @@ def submit_command(self, command, nodes=1): # noqa: D102 return self._remote_command_executor.run_remote_command("echo '{0}' | qsub".format(command)) def submit_script(self, script, nodes=1): # noqa: D102 - raise NotImplementedError + script_name = os.path.basename(script) + return self._remote_command_executor.run_remote_command( + "qsub {0}".format(script_name), additional_files=[script] + ) def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 __tracebackhide__ = True diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index 7df276d97d..c410ea0721 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -66,7 +66,7 @@ def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clu @pytest.mark.regions(["sa-east-1"]) @pytest.mark.instances(["c5.xlarge"]) -@pytest.mark.schedulers(["slurm"]) +@pytest.mark.schedulers(["slurm", "sge"]) @pytest.mark.usefixtures("region", "os", "instance") @pytest.mark.nodewatcher def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir): diff --git a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh new file mode 100755 index 0000000000..5d9c076da5 --- /dev/null +++ b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
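+# Stop the SGE execution daemon on this compute node: the instance keeps
+# running but stops reporting to the scheduler, so it should be detected as a
+# failing node and terminated by nodewatcher.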
+sudo /etc/init.d/sgeexecd.p6444 stop From ee8ce2cdfb35258af51909188abdb06e204d1d85 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 23 May 2019 14:45:32 -0700 Subject: [PATCH 092/121] Generalize Instance Whitelist * Preserves old behaviour with `batch_instances.json` ```bash python util/instance-whitelist.py --partition commercial --regions us-east-1 --dryrun true ``` * while adding new behaviour to produce `feature_whitelist.json` ```bash python util/instance-whitelist.py --partition commercial --efa c5n.18xlarge,i3en.24xlarge,p3dn.24xlarge --regions us-east-1 --dryrun true ``` Signed-off-by: Sean Smith --- ...tance-whitelist.py => instance-whitelist.py} | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) rename util/{batch-instance-whitelist.py => instance-whitelist.py} (85%) diff --git a/util/batch-instance-whitelist.py b/util/instance-whitelist.py similarity index 85% rename from util/batch-instance-whitelist.py rename to util/instance-whitelist.py index e4aec9713e..76f183244c 100755 --- a/util/batch-instance-whitelist.py +++ b/util/instance-whitelist.py @@ -47,7 +47,7 @@ def get_all_aws_regions(partition): return set(sorted(r.get("RegionName") for r in ec2.describe_regions().get("Regions"))) - UNSUPPORTED_REGIONS -def get_instance_whitelist(args, region): +def get_batch_instance_whitelist(args, region): # try to create a dummy compute environmment batch_client = boto3.client("batch", region_name=region) @@ -78,12 +78,11 @@ def get_instance_whitelist(args, region): return instances -def upload_to_s3(args, region, instances): +def upload_to_s3(args, region, instances, key): s3_client = boto3.resource("s3", region_name=region) bucket = args.bucket if args.bucket else "%s-aws-parallelcluster" % region - key = "instances/batch_instances.json" if args.dryrun == "true": print(instances) @@ -106,13 +105,18 @@ def upload_to_s3(args, region, instances): def main(args): # For all regions for region in args.regions: - instances = get_instance_whitelist(args, region) - response = upload_to_s3(args, region, instances) + batch_instances = get_batch_instance_whitelist(args, region) + if args.efa: + efa_instances = args.efa.split(",") + instances = {"Features": {"efa": {"instances": efa_instances}, "awsbatch": {"instances": batch_instances}}} + upload_to_s3(args, region, instances, "features/feature_whitelist.json") + else: + upload_to_s3(args, region, batch_instances, "instances/batch_instances.json") if __name__ == "__main__": # parse inputs - parser = argparse.ArgumentParser(description="Generate a whitelist of batch instance types.") + parser = argparse.ArgumentParser(description="Generate a whitelist of instance types per region.") parser.add_argument("--partition", type=str, help="commercial | china | govcloud", required=True) parser.add_argument( "--regions", @@ -123,6 +127,7 @@ def main(args): parser.add_argument( "--bucket", type=str, help="Bucket to upload too, defaults to [region]-aws-parallelcluster", required=False ) + parser.add_argument("--efa", type=str, help="Comma separated list of instances supported by EFA", required=False) parser.add_argument("--dryrun", type=str, help="Doesn't push anything to S3, just outputs", required=True) args = parser.parse_args() From 1bf3785c230757ce3e3407cf3ebfbad730557c11 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Mon, 6 May 2019 09:45:11 -0700 Subject: [PATCH 093/121] EFA Config flag Signed-off-by: Sean Smith --- cli/pcluster/cfnconfig.py | 1 + cloudformation/aws-parallelcluster.cfn.json | 22 +++++++++++++++++++++ 2 files 
changed, 23 insertions(+) diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index 5ed45bd2d0..74dfe05c57 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -518,6 +518,7 @@ def __init_cluster_parameters(self): custom_chef_runlist=("CustomChefRunList", None), additional_cfn_template=("AdditionalCfnTemplate", None), custom_awsbatch_template_url=("CustomAWSBatchTemplateURL", None), + enable_efa=("EFA", None), ) for key in cluster_options: try: diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 1844c3ff18..b4923c7afd 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -598,6 +598,11 @@ "Description": "Comma separated list of efs related options, 8 parameters in total, [shared_dir,efs_fs_id,performance_mode,efs_kms_key_id,provisioned_throughput,encrypted,throughput_mode,valid_existing_MTorNot]", "Type": "String", "Default": "NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE" + }, + "EFA": { + "Description": "Enable EFA on the compute nodes, enable_efa = compute", + "Type": "String", + "Default": "NONE" } }, "Conditions": { @@ -1195,6 +1200,14 @@ ] } ] + }, + "EnableEFA": { + "Fn::Equals": [ + { + "Ref": "EFA" + }, + "compute" + ] } }, "Mappings": { @@ -3415,6 +3428,15 @@ "NetworkInterfaces": [ { "DeviceIndex": 0, + "InterfaceType": { + "Fn::If": [ + "EnableEFA", + "EFA", + { + "Ref": "AWS::NoValue" + } + ] + }, "Groups": [ { "Fn::If": [ From ff40d92b7b64dd44483eadebc83cb6caeb36cbb1 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Mon, 6 May 2019 12:19:32 -0700 Subject: [PATCH 094/121] Validate EFA Parameters * Validate EFA Parameters Signed-off-by: Sean Smith --- cli/pcluster/cfnconfig.py | 27 +++++++++- cli/pcluster/config_sanity.py | 58 ++++++++++++++++++++- cli/pcluster/utils.py | 36 +++++++++---- cloudformation/aws-parallelcluster.cfn.json | 42 +++++++++++++-- 4 files changed, 145 insertions(+), 18 deletions(-) diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index 74dfe05c57..dec7c99e57 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -104,6 +104,9 @@ def __init__(self, args): # Initialize aliases public attributes self.__init_aliases() + # efa checks + self.__init_efa_parameters() + # Handle extra parameters supplied on command-line try: if self.args.extra_parameters is not None: @@ -518,7 +521,6 @@ def __init_cluster_parameters(self): custom_chef_runlist=("CustomChefRunList", None), additional_cfn_template=("AdditionalCfnTemplate", None), custom_awsbatch_template_url=("CustomAWSBatchTemplateURL", None), - enable_efa=("EFA", None), ) for key in cluster_options: try: @@ -531,6 +533,19 @@ def __init_cluster_parameters(self): except configparser.NoOptionError: pass + def __init_efa_parameters(self): + try: + __temp__ = self.__config.get(self.__cluster_section, "enable_efa") + if __temp__ != "compute": + self.__fail("valid values for enable_efa = compute") + + self.__validate_os("EFA", self.__get_os(), ["alinux", "centos7"]) + self.__validate_scheduler("EFA", self.__get_scheduler(), ["sge", "slurm", "torque"]) + self.__validate_resource("EFA", self.parameters) + self.parameters["EFA"] = __temp__ + except configparser.NoOptionError: + pass + def __init_extra_json_parameter(self): """Check for extra_json = { "cluster" : ... 
} configuration parameters and map to "cfncluster".""" extra_json = self.parameters.get("ExtraJson") @@ -597,12 +612,22 @@ def __check_option_absent_awsbatch(self, option): if self.__config.has_option(self.__cluster_section, option): self.__fail("option %s cannot be used with awsbatch" % option) + def __get_scheduler(self): + scheduler = "sge" + if self.__config.has_option(self.__cluster_section, "scheduler"): + scheduler = self.__config.get(self.__cluster_section, "scheduler") + return scheduler + def __get_os(self): base_os = "alinux" if self.__config.has_option(self.__cluster_section, "base_os"): base_os = self.__config.get(self.__cluster_section, "base_os") return base_os + def __validate_scheduler(self, service, scheduler, supported_schedulers): + if scheduler not in supported_schedulers: + self.__fail("%s supports following Schedulers: %s" % (service, supported_schedulers)) + def __validate_os(self, service, baseos, supported_oses): if baseos not in supported_oses: self.__fail("%s supports following OSes: %s" % (service, supported_oses)) diff --git a/cli/pcluster/config_sanity.py b/cli/pcluster/config_sanity.py index f31f6f8db9..8d9a6de785 100644 --- a/cli/pcluster/config_sanity.py +++ b/cli/pcluster/config_sanity.py @@ -23,7 +23,7 @@ import boto3 from botocore.exceptions import ClientError -from pcluster.utils import get_instance_vcpus, get_supported_batch_instances +from pcluster.utils import get_instance_vcpus, get_supported_features class ResourceValidator(object): @@ -222,6 +222,57 @@ def __validate_fsx_parameters(self, resource_type, resource_value): if not (1 <= int(resource_value[0]) <= 512000): self.__fail(resource_type, "has a minimum size of 1 MiB, and max size of 512,000 MiB") + def __validate_efa_sg(self, resource_type, sg_id): + try: + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + sg = ec2.describe_security_groups(GroupIds=[sg_id]).get("SecurityGroups")[0] + in_rules = sg.get("IpPermissions") + out_rules = sg.get("IpPermissionsEgress") + + allowed_in = False + allowed_out = False + for rule in in_rules: + # UserIdGroupPairs is always of length 1, so grabbing 0th object is ok + if ( + rule.get("IpProtocol") == "-1" + and len(rule.get("UserIdGroupPairs")) > 0 + and rule.get("UserIdGroupPairs")[0].get("GroupId") == sg_id + ): + allowed_in = True + break + for rule in out_rules: + if ( + rule.get("IpProtocol") == "-1" + and len(rule.get("UserIdGroupPairs")) > 0 + and rule.get("UserIdGroupPairs")[0].get("GroupId") == sg_id + ): + allowed_out = True + break + if not (allowed_in and allowed_out): + self.__fail( + resource_type, + "VPC Security Group %s must allow all traffic in and out from itself. 
" + "See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security" % sg_id, + ) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + + def __validate_efa_parameters(self, resource_type, resource_value): + supported_features = get_supported_features(self.region, "efa") + valid_instances = supported_features.get("instances") + if resource_value.get("ComputeInstanceType", None) not in valid_instances: + self.__fail(resource_type, "Compute Instance needs to be one of %s" % valid_instances) + if resource_value.get("PlacementGroup", "NONE") == "NONE": + self.__fail(resource_type, "Placement group is required, set placement_group.") + if "VPCSecurityGroupId" in resource_value: + sg_id = resource_value.get("VPCSecurityGroupId") + self.__validate_efa_sg(resource_type, sg_id) + def validate(self, resource_type, resource_value): # noqa: C901 FIXME """ Validate the given resource. Print an error and exit in case of error. @@ -530,6 +581,9 @@ def validate(self, resource_type, resource_value): # noqa: C901 FIXME # FSX FS Id check elif resource_type in ["fsx_fs_id", "FSx_storage_capacity", "FSx_imported_file_chunk_size", "FSx_export_path"]: self.__validate_fsx_parameters(resource_type, resource_value) + elif resource_type == "EFA": + self.__validate_efa_parameters(resource_type, resource_value) + # Batch Parameters elif resource_type == "AWSBatch_Parameters": # Check region @@ -566,7 +620,7 @@ def validate(self, resource_type, resource_value): # noqa: C901 FIXME if "ComputeInstanceType" in resource_value: compute_instance_type = resource_value["ComputeInstanceType"] try: - supported_instances = get_supported_batch_instances(self.region) + supported_instances = get_supported_features(self.region, "batch").get("instances") if supported_instances: for instance in compute_instance_type.split(","): if not instance.strip() in supported_instances: diff --git a/cli/pcluster/utils.py b/cli/pcluster/utils.py index a49d031c9c..b282a7ec42 100644 --- a/cli/pcluster/utils.py +++ b/cli/pcluster/utils.py @@ -131,21 +131,37 @@ def _get_json_from_s3(region, file_name): return json.loads(file_contents) -def get_supported_batch_instances(region): +def get_supported_features(region, feature): """ - Get a json object containing the instances supported by batch. + Get a json object containing the attributes supported by a feature, for example. + + { + "Features": { + "efa": { + "instances": ["c5n.18xlarge", "p3dn.24xlarge", "i3en.24xlarge"], + "baseos": ["alinux", "centos7"], + "schedulers": ["sge", "slurm", "torque"] + }, + "awsbatch": { + "instances": ["r3.8xlarge", ..., "m5.4xlarge"] + } + } + } :param region: AWS Region - :param instance_type: the instance type to search for. - :return: json object containing the instances supported by batch - or an empty object if unable to parse/get the instance list file + :param feature: the feature to search for, i.e. "efa" "awsbatch" + :return: json object containing all the attributes supported by feature """ try: - instances = _get_json_from_s3(region, "instances/batch_instances.json") - except (ValueError, ClientError): - instances = "" - - return instances + features = _get_json_from_s3(region, "features/feature_whitelist.json") + supported_features = features.get("Features").get(feature) + except (ValueError, ClientError, KeyError): + print( + "Failed validate %s. This is probably a bug on our end. 
Please set sanity_check = false and retry" % feature + ) + exit(1) + + return supported_features def get_instance_vcpus(region, instance_type): diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index b4923c7afd..c99e47bf53 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1202,11 +1202,15 @@ ] }, "EnableEFA": { - "Fn::Equals": [ + "Fn::Not": [ { - "Ref": "EFA" - }, - "compute" + "Fn::Equals": [ + { + "Ref": "EFA" + }, + "NONE" + ] + } ] } }, @@ -3431,7 +3435,7 @@ "InterfaceType": { "Fn::If": [ "EnableEFA", - "EFA", + "efa", { "Ref": "AWS::NoValue" } @@ -4225,6 +4229,34 @@ }, "Condition": "CreateSecurityGroups" }, + "ComputeSecurityGroupEgress": { + "Type": "AWS::EC2::SecurityGroupEgress", + "Properties": { + "IpProtocol": "-1", + "FromPort": 0, + "ToPort": 65535, + "DestinationSecurityGroupId": { + "Ref": "ComputeSecurityGroup" + }, + "GroupId": { + "Ref": "ComputeSecurityGroup" + } + }, + "Condition": "CreateSecurityGroups" + }, + "ComputeSecurityGroupNormalEgress": { + "Type": "AWS::EC2::SecurityGroupEgress", + "Properties": { + "IpProtocol": "-1", + "FromPort": 0, + "ToPort": 65535, + "CidrIp": "0.0.0.0/0", + "GroupId": { + "Ref": "ComputeSecurityGroup" + } + }, + "Condition": "CreateSecurityGroups" + }, "ComputeSecurityGroupIngress": { "Type": "AWS::EC2::SecurityGroupIngress", "Properties": { From d4ff28a5707041d7f0715af4c22f84d101a4f0f7 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 29 May 2019 19:09:40 +0200 Subject: [PATCH 095/121] integ tests: implement sge tests for pending jobs and invalid jobs Signed-off-by: Francesco De Martino --- .../tests/cfn-init/test_cfn_init.py | 2 +- .../tests/common/assertions.py | 31 +++++ .../tests/common/scaling_common.py | 13 -- .../tests/common/schedulers_common.py | 14 +- .../tests/schedulers/test_sge.py | 129 ++++++++++++++++++ .../test_sge/test_sge/pcluster.config.ini | 24 ++++ .../tests/schedulers/test_slurm.py | 18 +-- tests/integration-tests/tests/test_scaling.py | 7 +- 8 files changed, 207 insertions(+), 31 deletions(-) create mode 100644 tests/integration-tests/tests/common/assertions.py create mode 100644 tests/integration-tests/tests/schedulers/test_sge.py create mode 100644 tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init.py b/tests/integration-tests/tests/cfn-init/test_cfn_init.py index da5c587951..a96236297d 100644 --- a/tests/integration-tests/tests/cfn-init/test_cfn_init.py +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init.py @@ -15,8 +15,8 @@ from assertpy import assert_that from remote_command_executor import RemoteCommandExecutor +from tests.common.assertions import assert_instance_replaced_or_terminating from tests.common.compute_logs_common import wait_compute_log -from tests.common.scaling_common import assert_instance_replaced_or_terminating from tests.common.schedulers_common import SlurmCommands diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py new file mode 100644 index 0000000000..5bc557ea2a --- /dev/null +++ b/tests/integration-tests/tests/common/assertions.py @@ -0,0 +1,31 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. 
A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +import boto3 + +from assertpy import assert_that + + +def assert_instance_replaced_or_terminating(instance_id, region): + """Assert that a given instance got replaced or is marked as Unhealthy.""" + response = boto3.client("autoscaling", region_name=region).describe_auto_scaling_instances( + InstanceIds=[instance_id] + ) + assert_that( + not response["AutoScalingInstances"] + or response["AutoScalingInstances"][0]["LifecycleState"] == "Terminating" + or response["AutoScalingInstances"][0]["HealthStatus"] == "UNHEALTHY" + ).is_true() + + +def assert_asg_desired_capacity(region, asg_name, expected): + asg_client = boto3.client("autoscaling", region_name=region) + asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0] + assert_that(asg.get("DesiredCapacity")).is_equal_to(expected) diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py index ce80cc3616..0ad64467d3 100644 --- a/tests/integration-tests/tests/common/scaling_common.py +++ b/tests/integration-tests/tests/common/scaling_common.py @@ -14,7 +14,6 @@ import boto3 from retrying import RetryError, retry -from assertpy import assert_that from time_utils import seconds @@ -122,15 +121,3 @@ def get_desired_asg_capacity(region, stack_name): def get_max_asg_capacity(region, stack_name): """Retrieve the max capacity of the autoscaling group for a specific cluster.""" return _get_asg(region, stack_name)["MaxSize"] - - -def assert_instance_replaced_or_terminating(instance_id, region): - """Assert that a given instance got replaced or is marked as Unhealthy.""" - response = boto3.client("autoscaling", region_name=region).describe_auto_scaling_instances( - InstanceIds=[instance_id] - ) - assert_that( - not response["AutoScalingInstances"] - or response["AutoScalingInstances"][0]["LifecycleState"] == "Terminating" - or response["AutoScalingInstances"][0]["HealthStatus"] == "UNHEALTHY" - ).is_true() diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index 950d18f13b..7428bf5380 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -157,9 +157,17 @@ def assert_job_submitted(self, qsub_output): # noqa: D102 assert_that(match).is_not_none() return match.group(1) - def submit_command(self, command, nodes=1): # noqa: D102 - # TODO add support for multiple nodes - return self._remote_command_executor.run_remote_command("echo '{0}' | qsub".format(command)) + def submit_command(self, command, nodes=1, slots=None, hold=False): # noqa: D102 + flags = "" + if nodes != 1: + raise Exception("SGE does not support nodes option") + if slots: + flags += "-pe mpi {0} ".format(slots) + if hold: + flags += "-h " + return self._remote_command_executor.run_remote_command( + "echo '{0}' | qsub {1}".format(command, flags), raise_on_error=False + ) def submit_script(self, script, nodes=1): # noqa: D102 script_name = os.path.basename(script) diff --git a/tests/integration-tests/tests/schedulers/test_sge.py 
b/tests/integration-tests/tests/schedulers/test_sge.py new file mode 100644 index 0000000000..59e5bfd807 --- /dev/null +++ b/tests/integration-tests/tests/schedulers/test_sge.py @@ -0,0 +1,129 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging +import re +import time + +import pytest + +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor +from tests.common.assertions import assert_asg_desired_capacity +from tests.common.scaling_common import get_compute_nodes_allocation +from tests.common.schedulers_common import SgeCommands +from time_utils import minutes + + +@pytest.mark.regions(["ap-southeast-1"]) +@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.schedulers(["sge"]) +@pytest.mark.usefixtures("os", "instance", "scheduler") +def test_sge(region, pcluster_config_reader, clusters_factory): + """ + Test all AWS SGE related features. + + Grouped all tests in a single function so that cluster can be reused for all of them. + """ + scaledown_idletime = 3 + max_queue_size = 5 + max_slots = 4 + cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + _test_sge_version(remote_command_executor) + _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime) + _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) + + +def _test_sge_version(remote_command_executor): + logging.info("Testing SGE Version") + version = remote_command_executor.run_remote_command("qstat -help | head -n 1").stdout + assert_that(version).is_equal_to("SGE 8.1.9") + + +def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime): + sge_commands = SgeCommands(remote_command_executor) + + # Make sure the cluster has at least 1 node in the queue so that we can verify cluster scales down correctly + if sge_commands.compute_nodes_count() == 0: + result = sge_commands.submit_command("sleep 1") + job_id = sge_commands.assert_job_submitted(result.stdout) + sge_commands.wait_job_completed(job_id) + assert_that(sge_commands.compute_nodes_count()).is_greater_than(0) + + logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available") + result = sge_commands.submit_command("sleep 1000", slots=(max_slots * max_queue_size) + 1) + max_slots_job_id = sge_commands.assert_job_submitted(result.stdout) + assert_that(_get_job_state(remote_command_executor, max_slots_job_id)).is_equal_to("qw") + + logging.info("Testing cluster doesn't scale when job is set on hold") + result = sge_commands.submit_command("sleep 1000", hold=True) + hold_job_id = sge_commands.assert_job_submitted(result.stdout) + assert_that(_get_job_state(remote_command_executor, hold_job_id)).is_equal_to("hqw") + + logging.info("Testing cluster scales down when 
pending jobs cannot be submitted") + _, compute_nodes_time_series, _ = get_compute_nodes_allocation( + scheduler_commands=sge_commands, + region=region, + stack_name=cluster.cfn_name, + max_monitoring_time=minutes(scaledown_idletime) + minutes(5), + ) + assert_that(compute_nodes_time_series[-1]).is_equal_to(0) + + # Check we are not scaling up again + time.sleep(60) + assert_asg_desired_capacity(region, cluster.asg, expected=0) + pending_jobs = remote_command_executor.run_remote_command("qstat -s p | tail -n +3 | awk '{ print $1 }'").stdout + pending_jobs = pending_jobs.splitlines() + assert_that(pending_jobs).contains(max_slots_job_id, hold_job_id) + + +def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime): + logging.info("Testing cluster doesn't scale when job dependencies are not satisfied") + sge_commands = SgeCommands(remote_command_executor) + result = sge_commands.submit_command("sleep 60", nodes=1) + job_id = sge_commands.assert_job_submitted(result.stdout) + result = remote_command_executor.run_remote_command( + "echo 'sleep 1' | qsub -hold_jid {0}".format(job_id), raise_on_error=False + ) + dependent_job_id = sge_commands.assert_job_submitted(result.stdout) + + assert_that(_get_job_state(remote_command_executor, dependent_job_id)).is_equal_to("hqw") + + # Assert scaling worked as expected + jobs_execution_time = 1 + estimated_scaleup_time = 5 + max_scaledown_time = 10 + asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation( + scheduler_commands=sge_commands, + region=region, + stack_name=stack_name, + max_monitoring_time=minutes(jobs_execution_time) + + minutes(scaledown_idletime) + + minutes(estimated_scaleup_time) + + minutes(max_scaledown_time), + ) + assert_that(max(asg_capacity_time_series)).is_equal_to(1) + assert_that(max(compute_nodes_time_series)).is_equal_to(1) + assert_that(asg_capacity_time_series[-1]).is_equal_to(0) + assert_that(compute_nodes_time_series[-1]).is_equal_to(0) + # Assert jobs were completed + sge_commands.assert_job_succeeded(job_id) + sge_commands.assert_job_succeeded(dependent_job_id) + + +def _get_job_state(remote_command_executor, job_id): + pending_jobs = remote_command_executor.run_remote_command("qstat | tail -n +3 | awk '{ print $1,$5 }'").stdout + match = re.search(r"{0} (\w+)".format(job_id), pending_jobs) + assert_that(match).is_not_none() + return match.group(1) diff --git a/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini b/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini new file mode 100644 index 0000000000..c443684556 --- /dev/null +++ b/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini @@ -0,0 +1,24 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = sge +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +initial_queue_size = 0 +max_queue_size = {{ max_queue_size }} +maintain_initial_size = false +scaling_settings = custom + +[scaling custom] +scaledown_idletime = {{ scaledown_idletime }} + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index d0538e7de9..8fe316e1c1 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py 
+++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -17,6 +17,7 @@ from assertpy import assert_that from remote_command_executor import RemoteCommandExecutor +from tests.common.assertions import assert_asg_desired_capacity from tests.common.scaling_common import get_compute_nodes_allocation from tests.common.schedulers_common import SlurmCommands from time_utils import minutes @@ -114,20 +115,25 @@ def _test_job_dependencies(remote_command_executor, region, stack_name, scaledow assert_that(max(compute_nodes_time_series)).is_equal_to(1) assert_that(asg_capacity_time_series[-1]).is_equal_to(0) assert_that(compute_nodes_time_series[-1]).is_equal_to(0) + # Assert scheduler configuration is correct _assert_dummy_nodes(remote_command_executor, max_queue_size) assert_that(_retrieve_slurm_nodes_from_config(remote_command_executor)).is_empty() + # Assert jobs were completed + slurm_commands.assert_job_succeeded(job_id) + slurm_commands.assert_job_succeeded(dependent_job_id) def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name): logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available") slurm_commands = SlurmCommands(remote_command_executor) - result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1) + result = slurm_commands.submit_command("sleep 1000", nodes=max_queue_size + 1) max_nodes_job_id = slurm_commands.assert_job_submitted(result.stdout) result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' --cpus-per-task 5") max_cpu_job_id = slurm_commands.assert_job_submitted(result.stdout) - # Wait for reason to be computed - time.sleep(3) + # Check we are not scaling + time.sleep(60) + assert_asg_desired_capacity(region, asg_name, expected=0) assert_that(_get_job_info(remote_command_executor, max_nodes_job_id)).contains( "JobState=PENDING Reason=PartitionNodeLimit" ) @@ -136,12 +142,6 @@ def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_na "_or_reserved_for_jobs_in_higher_priority_partitions" ) - # Check we are not scaling - time.sleep(60) - asg_client = boto3.client("autoscaling", region_name=region) - asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0] - assert_that(asg.get("DesiredCapacity")).is_equal_to(0) - def _retrieve_slurm_dummy_nodes_from_config(remote_command_executor): retrieve_dummy_nodes_command = "sudo cat /opt/slurm/etc/slurm_parallelcluster_nodes.conf | head -n 1" diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index c410ea0721..db94f75f5b 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -15,12 +15,9 @@ from assertpy import assert_that from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor +from tests.common.assertions import assert_instance_replaced_or_terminating from tests.common.compute_logs_common import wait_compute_log -from tests.common.scaling_common import ( - assert_instance_replaced_or_terminating, - get_compute_nodes_allocation, - get_desired_asg_capacity, -) +from tests.common.scaling_common import get_compute_nodes_allocation, get_desired_asg_capacity from tests.common.schedulers_common import get_scheduler_commands from time_utils import minutes From 2346e3b894563f91739a313073976f492d957ba4 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Fri, 24 May 2019 13:37:19 -0700 Subject: [PATCH 096/121] 
Clarify wording on -t flag --- cli/pcluster/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cli/pcluster/cli.py b/cli/pcluster/cli.py index 20438ba04e..dd2727b913 100644 --- a/cli/pcluster/cli.py +++ b/cli/pcluster/cli.py @@ -157,7 +157,7 @@ def _get_parser(): "--template-url", help="Specifies the URL for a custom CloudFormation template, " "if it was used at creation time.", ) - pcreate.add_argument("-t", "--cluster-template", help="Indicates which cluster template to use.") + pcreate.add_argument("-t", "--cluster-template", help="Indicates which section of the cluster template to use.") pcreate.add_argument("-p", "--extra-parameters", type=json.loads, help="Adds extra parameters to the stack create.") pcreate.add_argument("-g", "--tags", type=json.loads, help="Specifies additional tags to be added to the stack.") pcreate.set_defaults(func=create) @@ -181,7 +181,7 @@ def _get_parser(): default=False, help="Disable CloudFormation stack rollback on error.", ) - pupdate.add_argument("-t", "--cluster-template", help="Indicates which cluster template to use.") + pupdate.add_argument("-t", "--cluster-template", help="Indicates which section of the cluster template to use.") pupdate.add_argument("-p", "--extra-parameters", help="Adds extra parameters to the stack update.") pupdate.add_argument( "-rd", From 11985943eab85104467353f23727c8dfb378bf86 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 30 May 2019 10:01:47 -0700 Subject: [PATCH 097/121] Add enable_efa to dna.json Signed-off-by: Sean Smith --- cli/pcluster/cfnconfig.py | 10 +++++++++- cli/pcluster/config_sanity.py | 4 ---- cloudformation/aws-parallelcluster.cfn.json | 9 +++++++++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index dec7c99e57..104f1f7d5e 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -30,7 +30,7 @@ from botocore.exceptions import ClientError from pcluster.config_sanity import ResourceValidator -from pcluster.utils import get_instance_vcpus +from pcluster.utils import get_instance_vcpus, get_supported_features class ParallelClusterConfig(object): @@ -539,6 +539,10 @@ def __init_efa_parameters(self): if __temp__ != "compute": self.__fail("valid values for enable_efa = compute") + supported_features = get_supported_features(self.region, "efa") + valid_instances = supported_features.get("instances") + + self.__validate_instance("EFA", self.parameters.get("ComputeInstanceType"), valid_instances) self.__validate_os("EFA", self.__get_os(), ["alinux", "centos7"]) self.__validate_scheduler("EFA", self.__get_scheduler(), ["sge", "slurm", "torque"]) self.__validate_resource("EFA", self.parameters) @@ -624,6 +628,10 @@ def __get_os(self): base_os = self.__config.get(self.__cluster_section, "base_os") return base_os + def __validate_instance(self, service, instance, valid_instances): + if instance not in valid_instances: + self.__fail("%s can only be used with the following instances: %s" % (service, valid_instances)) + def __validate_scheduler(self, service, scheduler, supported_schedulers): if scheduler not in supported_schedulers: self.__fail("%s supports following Schedulers: %s" % (service, supported_schedulers)) diff --git a/cli/pcluster/config_sanity.py b/cli/pcluster/config_sanity.py index 8d9a6de785..f58571a42f 100644 --- a/cli/pcluster/config_sanity.py +++ b/cli/pcluster/config_sanity.py @@ -263,10 +263,6 @@ def __validate_efa_sg(self, resource_type, sg_id): self.__fail(resource_type, 
e.response.get("Error").get("Message")) def __validate_efa_parameters(self, resource_type, resource_value): - supported_features = get_supported_features(self.region, "efa") - valid_instances = supported_features.get("instances") - if resource_value.get("ComputeInstanceType", None) not in valid_instances: - self.__fail(resource_type, "Compute Instance needs to be one of %s" % valid_instances) if resource_value.get("PlacementGroup", "NONE") == "NONE": self.__fail(resource_type, "Placement group is required, set placement_group.") if "VPCSecurityGroupId" in resource_value: diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index c99e47bf53..4b3a8f6b4a 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -2404,6 +2404,9 @@ "stack_name": { "Ref": "AWS::StackName" }, + "enable_efa": { + "Ref": "EFA" + }, "cfn_raid_vol_ids": { "Fn::If": [ "CreateRAIDSubstack", @@ -3242,6 +3245,9 @@ "stack_name": { "Ref": "AWS::StackName" }, + "enable_efa": { + "Ref": "EFA" + }, "cfn_raid_parameters": { "Ref": "RAIDOptions" }, @@ -3924,6 +3930,9 @@ "stack_name": { "Ref": "AWS::StackName" }, + "enable_efa": { + "Ref": "EFA" + }, "cfn_raid_parameters": { "Ref": "RAIDOptions" }, From b121f1844ed3368516c24df0fee43d455d58c028 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 31 May 2019 19:00:26 +0200 Subject: [PATCH 098/121] integ tests: check nodes are correctly removed from scheduler on faulty node termination with running job Signed-off-by: Francesco De Martino --- .../tests/common/assertions.py | 8 ++++++ tests/integration-tests/tests/test_scaling.py | 25 +++++++++++-------- .../sge_kill_scheduler_job.sh | 2 ++ .../slurm_kill_scheduler_job.sh | 2 ++ 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py index 5bc557ea2a..656e37431d 100644 --- a/tests/integration-tests/tests/common/assertions.py +++ b/tests/integration-tests/tests/common/assertions.py @@ -29,3 +29,11 @@ def assert_asg_desired_capacity(region, asg_name, expected): asg_client = boto3.client("autoscaling", region_name=region) asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0] assert_that(asg.get("DesiredCapacity")).is_equal_to(expected) + + +def assert_no_errors_in_logs(remote_command_executor, log_files): + __tracebackhide__ = True + for log_file in log_files: + log = remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout + for error_level in ["CRITICAL", "ERROR"]: + assert_that(log).does_not_contain(error_level) diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index db94f75f5b..4cd09d62bf 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -12,14 +12,15 @@ import logging import pytest +from retrying import retry from assertpy import assert_that from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor -from tests.common.assertions import assert_instance_replaced_or_terminating +from tests.common.assertions import assert_instance_replaced_or_terminating, assert_no_errors_in_logs from tests.common.compute_logs_common import wait_compute_log from tests.common.scaling_common import get_compute_nodes_allocation, get_desired_asg_capacity from tests.common.schedulers_common import 
get_scheduler_commands -from time_utils import minutes +from time_utils import minutes, seconds @pytest.mark.skip_schedulers(["awsbatch"]) @@ -58,7 +59,7 @@ def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clu ) logging.info("Verifying no error in logs") - _assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) + assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) @pytest.mark.regions(["sa-east-1"]) @@ -72,6 +73,8 @@ def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_ remote_command_executor = RemoteCommandExecutor(cluster) scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + compute_nodes = scheduler_commands.get_compute_nodes() + # submit a job that kills the slurm daemon so that the node enters a failing state scheduler_commands.submit_script(str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler))) instance_id = wait_compute_log(remote_command_executor) @@ -80,6 +83,14 @@ def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_ assert_instance_replaced_or_terminating(instance_id, region) # verify that desired capacity is still 1 assert_that(get_desired_asg_capacity(region, cluster.cfn_name)).is_equal_to(1) + _assert_nodes_removed_from_scheduler(scheduler_commands, compute_nodes) + + assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) + + +@retry(wait_fixed=seconds(20), stop_max_delay=minutes(5)) +def _assert_nodes_removed_from_scheduler(scheduler_commands, nodes): + assert_that(scheduler_commands.get_compute_nodes()).does_not_contain(*nodes) def _assert_compute_logs(remote_command_executor, instance_id): @@ -153,11 +164,3 @@ def _assert_test_jobs_completed(remote_command_executor, max_jobs_exec_time): jobs_execution_time = jobs_completion_time - jobs_start_time logging.info("Test jobs completed in %d seconds", jobs_execution_time) assert_that(jobs_execution_time).is_less_than(max_jobs_exec_time) - - -def _assert_no_errors_in_logs(remote_command_executor, log_files): - __tracebackhide__ = True - for log_file in log_files: - log = remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout - for error_level in ["CRITICAL", "ERROR"]: - assert_that(log).does_not_contain(error_level) diff --git a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh index 5d9c076da5..1a65342d6c 100755 --- a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh +++ b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh @@ -11,3 +11,5 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. 
sudo /etc/init.d/sgeexecd.p6444 stop +# keep job up and running +sleep infinity diff --git a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh index b0676573e4..da42d09253 100755 --- a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh +++ b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh @@ -11,3 +11,5 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. sudo kill $(ps aux | grep '[s]lurm' | awk '{print $2}') +# keep job up and running +sleep infinity From ba241bfb0b6725a9b4aca4bfee79708ea49e6ee2 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 31 May 2019 19:02:10 +0200 Subject: [PATCH 099/121] integ tests: add array and parallel jobs tests to slurm and sge schedulers Signed-off-by: Francesco De Martino --- .../tests/common/assertions.py | 21 +++++++ .../tests/common/schedulers_common.py | 11 +++- .../tests/schedulers/test_sge.py | 56 +++++++++---------- .../test_sge/test_sge/pcluster.config.ini | 2 + .../tests/schedulers/test_slurm.py | 56 +++++++++++-------- .../test_slurm/test_slurm/pcluster.config.ini | 2 + 6 files changed, 92 insertions(+), 56 deletions(-) diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py index 656e37431d..7074da1f2d 100644 --- a/tests/integration-tests/tests/common/assertions.py +++ b/tests/integration-tests/tests/common/assertions.py @@ -11,6 +11,8 @@ import boto3 from assertpy import assert_that +from tests.common.scaling_common import get_compute_nodes_allocation +from time_utils import minutes def assert_instance_replaced_or_terminating(instance_id, region): @@ -37,3 +39,22 @@ def assert_no_errors_in_logs(remote_command_executor, log_files): log = remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout for error_level in ["CRITICAL", "ERROR"]: assert_that(log).does_not_contain(error_level) + + +def assert_scaling_worked(scheduler_commands, region, stack_name, scaledown_idletime, expected_max, expected_final): + jobs_execution_time = 1 + estimated_scaleup_time = 5 + max_scaledown_time = 10 + asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation( + scheduler_commands=scheduler_commands, + region=region, + stack_name=stack_name, + max_monitoring_time=minutes(jobs_execution_time) + + minutes(scaledown_idletime) + + minutes(estimated_scaleup_time) + + minutes(max_scaledown_time), + ) + assert_that(max(asg_capacity_time_series)).is_equal_to(expected_max) + assert_that(max(compute_nodes_time_series)).is_equal_to(expected_max) + assert_that(asg_capacity_time_series[-1]).is_equal_to(expected_final) + assert_that(compute_nodes_time_series[-1]).is_equal_to(expected_final) diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index 7428bf5380..fbf49a0f00 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -151,9 +151,13 @@ def get_job_exit_status(self, job_id): # noqa: D102 assert_that(match).is_not_none() return
match.group(1) - def assert_job_submitted(self, qsub_output): # noqa: D102 + def assert_job_submitted(self, qsub_output, is_array=False): # noqa: D102 __tracebackhide__ = True - match = re.search(r"Your job ([0-9]+) \(.+\) has been submitted", qsub_output) + if is_array: + regex = r"Your job-array ([0-9]+)\.[0-9\-:]+ \(.+\) has been submitted" + else: + regex = r"Your job ([0-9]+) \(.+\) has been submitted" + match = re.search(regex, qsub_output) assert_that(match).is_not_none() return match.group(1) @@ -186,7 +190,8 @@ def compute_nodes_count(self): # noqa: D102 return int(result.stdout.split()[-1]) def get_compute_nodes(self): # noqa: D102 - raise NotImplementedError + result = self._remote_command_executor.run_remote_command("qhost | grep ip- | awk '{print $1}'") + return result.stdout.splitlines() class SlurmCommands(SchedulerCommands): diff --git a/tests/integration-tests/tests/schedulers/test_sge.py b/tests/integration-tests/tests/schedulers/test_sge.py index 59e5bfd807..c2b67556cb 100644 --- a/tests/integration-tests/tests/schedulers/test_sge.py +++ b/tests/integration-tests/tests/schedulers/test_sge.py @@ -11,16 +11,13 @@ # See the License for the specific language governing permissions and limitations under the License. import logging import re -import time import pytest from assertpy import assert_that from remote_command_executor import RemoteCommandExecutor -from tests.common.assertions import assert_asg_desired_capacity -from tests.common.scaling_common import get_compute_nodes_allocation +from tests.common.assertions import assert_no_errors_in_logs, assert_scaling_worked from tests.common.schedulers_common import SgeCommands -from time_utils import minutes @pytest.mark.regions(["ap-southeast-1"]) @@ -43,6 +40,9 @@ def test_sge(region, pcluster_config_reader, clusters_factory): _test_sge_version(remote_command_executor) _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime) _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) + _test_job_arrays_and_parallel_jobs(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) + + assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) def _test_sge_version(remote_command_executor): @@ -52,6 +52,7 @@ def _test_sge_version(remote_command_executor): def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime): + logging.info("Testing jobs that violate scheduling requirements") sge_commands = SgeCommands(remote_command_executor) # Make sure the cluster has at least 1 node in the queue so that we can verify cluster scales down correctly @@ -72,17 +73,8 @@ def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, assert_that(_get_job_state(remote_command_executor, hold_job_id)).is_equal_to("hqw") logging.info("Testing cluster scales down when pending jobs cannot be submitted") - _, compute_nodes_time_series, _ = get_compute_nodes_allocation( - scheduler_commands=sge_commands, - region=region, - stack_name=cluster.cfn_name, - max_monitoring_time=minutes(scaledown_idletime) + minutes(5), - ) - assert_that(compute_nodes_time_series[-1]).is_equal_to(0) - - # Check we are not scaling up again - time.sleep(60) - assert_asg_desired_capacity(region, cluster.asg, expected=0) + assert_scaling_worked(sge_commands, region, cluster.cfn_name, scaledown_idletime, expected_max=1, expected_final=0) + # Assert jobs are still 
pending pending_jobs = remote_command_executor.run_remote_command("qstat -s p | tail -n +3 | awk '{ print $1 }'").stdout pending_jobs = pending_jobs.splitlines() assert_that(pending_jobs).contains(max_slots_job_id, hold_job_id) @@ -101,27 +93,29 @@ def _test_job_dependencies(remote_command_executor, region, stack_name, scaledow assert_that(_get_job_state(remote_command_executor, dependent_job_id)).is_equal_to("hqw") # Assert scaling worked as expected - jobs_execution_time = 1 - estimated_scaleup_time = 5 - max_scaledown_time = 10 - asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation( - scheduler_commands=sge_commands, - region=region, - stack_name=stack_name, - max_monitoring_time=minutes(jobs_execution_time) - + minutes(scaledown_idletime) - + minutes(estimated_scaleup_time) - + minutes(max_scaledown_time), - ) - assert_that(max(asg_capacity_time_series)).is_equal_to(1) - assert_that(max(compute_nodes_time_series)).is_equal_to(1) - assert_that(asg_capacity_time_series[-1]).is_equal_to(0) - assert_that(compute_nodes_time_series[-1]).is_equal_to(0) + assert_scaling_worked(sge_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0) # Assert jobs were completed sge_commands.assert_job_succeeded(job_id) sge_commands.assert_job_succeeded(dependent_job_id) +def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime): + logging.info("Testing cluster scales correctly with array jobs and parallel jobs") + sge_commands = SgeCommands(remote_command_executor) + + result = remote_command_executor.run_remote_command("echo 'sleep 1' | qsub -t 1-5", raise_on_error=False) + array_job_id = sge_commands.assert_job_submitted(result.stdout, is_array=True) + + result = remote_command_executor.run_remote_command("echo 'sleep 1' | qsub -pe mpi 4", raise_on_error=False) + parallel_job_id = sge_commands.assert_job_submitted(result.stdout) + + # Assert scaling worked as expected + assert_scaling_worked(sge_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0) + # Assert jobs were completed + sge_commands.assert_job_succeeded(array_job_id) + sge_commands.assert_job_succeeded(parallel_job_id) + + def _get_job_state(remote_command_executor, job_id): pending_jobs = remote_command_executor.run_remote_command("qstat | tail -n +3 | awk '{ print $1,$5 }'").stdout match = re.search(r"{0} (\w+)".format(job_id), pending_jobs) diff --git a/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini b/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini index c443684556..e0f1238bd3 100644 --- a/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini +++ b/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini @@ -22,3 +22,5 @@ scaledown_idletime = {{ scaledown_idletime }} [vpc parallelcluster-vpc] vpc_id = {{ vpc_id }} master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +use_public_ips = false diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index 8fe316e1c1..ce6c171156 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -16,11 +16,9 @@ import pytest from assertpy import assert_that -from remote_command_executor import RemoteCommandExecutor -from tests.common.assertions import assert_asg_desired_capacity 
-from tests.common.scaling_common import get_compute_nodes_allocation +from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor +from tests.common.assertions import assert_asg_desired_capacity, assert_no_errors_in_logs, assert_scaling_worked from tests.common.schedulers_common import SlurmCommands -from time_utils import minutes @pytest.mark.regions(["us-west-1"]) @@ -43,8 +41,11 @@ def test_slurm(region, pcluster_config_reader, clusters_factory): _test_dynamic_max_cluster_size(remote_command_executor, region, cluster.asg) _test_cluster_limits(remote_command_executor, max_queue_size, region, cluster.asg) _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime, max_queue_size) + _test_job_arrays_and_parallel_jobs(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size) + assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) + def _test_slurm_version(remote_command_executor): logging.info("Testing Slurm Version") @@ -99,28 +100,13 @@ def _test_job_dependencies(remote_command_executor, region, stack_name, scaledow ) assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency") - jobs_execution_time = 1 - estimated_scaleup_time = 5 - max_scaledown_time = 10 - asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation( - scheduler_commands=slurm_commands, - region=region, - stack_name=stack_name, - max_monitoring_time=minutes(jobs_execution_time) - + minutes(scaledown_idletime) - + minutes(estimated_scaleup_time) - + minutes(max_scaledown_time), - ) - assert_that(max(asg_capacity_time_series)).is_equal_to(1) - assert_that(max(compute_nodes_time_series)).is_equal_to(1) - assert_that(asg_capacity_time_series[-1]).is_equal_to(0) - assert_that(compute_nodes_time_series[-1]).is_equal_to(0) + assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0) # Assert scheduler configuration is correct _assert_dummy_nodes(remote_command_executor, max_queue_size) assert_that(_retrieve_slurm_nodes_from_config(remote_command_executor)).is_empty() # Assert jobs were completed - slurm_commands.assert_job_succeeded(job_id) - slurm_commands.assert_job_succeeded(dependent_job_id) + _assert_job_completed(remote_command_executor, job_id) + _assert_job_completed(remote_command_executor, dependent_job_id) def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name): @@ -143,6 +129,23 @@ def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_na ) +def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime): + logging.info("Testing cluster scales correctly with array jobs and parallel jobs") + slurm_commands = SlurmCommands(remote_command_executor) + + result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -a 1-5") + array_job_id = slurm_commands.assert_job_submitted(result.stdout) + + result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -c 3 -n 2") + parallel_job_id = slurm_commands.assert_job_submitted(result.stdout) + + # Assert scaling worked as expected + assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0) + # Assert jobs were completed + _assert_job_completed(remote_command_executor, 
array_job_id) + _assert_job_completed(remote_command_executor, parallel_job_id) + + def _retrieve_slurm_dummy_nodes_from_config(remote_command_executor): retrieve_dummy_nodes_command = "sudo cat /opt/slurm/etc/slurm_parallelcluster_nodes.conf | head -n 1" return remote_command_executor.run_remote_command(retrieve_dummy_nodes_command).stdout @@ -172,3 +175,12 @@ def _assert_dummy_nodes(remote_command_executor, count): def _get_job_info(remote_command_executor, job_id): return remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)).stdout + + +def _assert_job_completed(remote_command_executor, job_id): + try: + result = remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id), log_error=False) + return "JobState=COMPLETED" in result.stdout + except RemoteCommandExecutionError as e: + # Handle the case when job is deleted from history + assert_that(e.result.stdout).contains("slurm_load_jobs error: Invalid job id specified") diff --git a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.config.ini b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.config.ini index fddc2bc7f5..50a7c4cf2f 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.config.ini +++ b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.config.ini @@ -22,3 +22,5 @@ scaledown_idletime = {{ scaledown_idletime }} [vpc parallelcluster-vpc] vpc_id = {{ vpc_id }} master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +use_public_ips = false From f220a0ab40286d1bf586c6b37b61f9283ed19530 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Tue, 4 Jun 2019 10:25:27 -0700 Subject: [PATCH 100/121] Add depends on to cloudformation sg egress rule Signed-off-by: Sean Smith --- cloudformation/aws-parallelcluster.cfn.json | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index 4b3a8f6b4a..abb1fe6e27 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -4264,6 +4264,7 @@ "Ref": "ComputeSecurityGroup" } }, + "DependsOn": "ComputeSecurityGroupEgress", "Condition": "CreateSecurityGroups" }, "ComputeSecurityGroupIngress": { From 54e6fb99ab4f9ed6d357f0c41f61c6185946b370 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Wed, 5 Jun 2019 09:38:46 -0700 Subject: [PATCH 101/121] Add Ubuntu1604 EFA Support Signed-off-by: Sean Smith --- cli/pcluster/cfnconfig.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index 104f1f7d5e..0b258aceeb 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -543,7 +543,7 @@ def __init_efa_parameters(self): valid_instances = supported_features.get("instances") self.__validate_instance("EFA", self.parameters.get("ComputeInstanceType"), valid_instances) - self.__validate_os("EFA", self.__get_os(), ["alinux", "centos7"]) + self.__validate_os("EFA", self.__get_os(), ["alinux", "centos7", "ubuntu1604"]) self.__validate_scheduler("EFA", self.__get_scheduler(), ["sge", "slurm", "torque"]) self.__validate_resource("EFA", self.parameters) self.parameters["EFA"] = __temp__ From 71e1d6e2316ec7b415b0535e8cb7a623d8ba36e7 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Wed, 5 Jun 2019 11:06:37 +0200 Subject: [PATCH 102/121] Integ tests: filter tests at collection time This allows a better redistribution of tests
across the workers Signed-off-by: Francesco De Martino --- tests/integration-tests/conftest.py | 28 +-- tests/integration-tests/conftest_markers.py | 180 ++++++++++---------- 2 files changed, 108 insertions(+), 100 deletions(-) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 8cb2908980..192db95f4f 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -60,8 +60,8 @@ def pytest_addoption(parser): def pytest_generate_tests(metafunc): """Generate (multiple) parametrized calls to a test function.""" - _parametrize_from_option(metafunc, "instance", "instances") _parametrize_from_option(metafunc, "region", "regions") + _parametrize_from_option(metafunc, "instance", "instances") _parametrize_from_option(metafunc, "os", "oss") _parametrize_from_option(metafunc, "scheduler", "schedulers") @@ -90,24 +90,24 @@ def pytest_configure(config): def pytest_runtest_call(item): """Called to execute the test item.""" _add_properties_to_report(item) - add_default_markers(item) - - check_marker_list(item, "instances", "instance") - check_marker_list(item, "regions", "region") - check_marker_list(item, "oss", "os") - check_marker_list(item, "schedulers", "scheduler") - check_marker_skip_list(item, "skip_instances", "instance") - check_marker_skip_list(item, "skip_regions", "region") - check_marker_skip_list(item, "skip_oss", "os") - check_marker_skip_list(item, "skip_schedulers", "scheduler") - check_marker_dimensions(item) - check_marker_skip_dimensions(item) - logging.info("Running test " + item.name) def pytest_collection_modifyitems(items): """Called after collection has been performed, may filter or re-order the items in-place.""" + add_default_markers(items) + + check_marker_list(items, "instances", "instance") + check_marker_list(items, "regions", "region") + check_marker_list(items, "oss", "os") + check_marker_list(items, "schedulers", "scheduler") + check_marker_skip_list(items, "skip_instances", "instance") + check_marker_skip_list(items, "skip_regions", "region") + check_marker_skip_list(items, "skip_oss", "os") + check_marker_skip_list(items, "skip_schedulers", "scheduler") + check_marker_dimensions(items) + check_marker_skip_dimensions(items) + _add_filename_markers(items) diff --git a/tests/integration-tests/conftest_markers.py b/tests/integration-tests/conftest_markers.py index 5e7115abba..9d1b09885a 100644 --- a/tests/integration-tests/conftest_markers.py +++ b/tests/integration-tests/conftest_markers.py @@ -35,17 +35,18 @@ class InvalidMarkerError(Exception): pass -def add_default_markers(item): +def add_default_markers(items): """ Add default markers for dimensions that need to be skipped by default for all tests. - :param item: pytest Item object markers are applied to. + :param items: pytest Item object markers are applied to. """ - for dimensions in UNSUPPORTED_DIMENSIONS: - item.add_marker(pytest.mark.skip_dimensions(*dimensions)) + for item in items: + for dimensions in UNSUPPORTED_DIMENSIONS: + item.add_marker(pytest.mark.skip_dimensions(*dimensions)) -def check_marker_list(item, marker_name, arg_name): +def check_marker_list(items, marker_name, arg_name): """ Skip all tests that are annotated with marker marker_name and have the arg value corresponding to arg_name not listed in the list passed as first argument to the marker. @@ -56,33 +57,34 @@ def test(arg_name) The test is executed only if arg_name is equal to "value1" or "value2". - :param item: pytest Item object annotated with markers. 
+ :param items: pytest Item objects annotated with markers. :param marker_name: name of the marker to process. :param arg_name: arg name the marker values should be compared to. """ - arg_value = item.funcargs.get(arg_name) - allowed_values = [] - for marker in item.iter_markers(name=marker_name): - _validate_marker(marker_name, [marker_name + "_list"], len(marker.args)) - allowed_values.extend(marker.args[0]) - - if not allowed_values or arg_value in allowed_values: - return - skip_message = ( - "Skipping test {test_name} because {arg_name} {arg_value} is not in {marker} allowed values: " - "{allowed_values}".format( - test_name=item.name, - arg_name=arg_name, - arg_value=arg_value, - marker=marker_name, - allowed_values=allowed_values, + for item in list(items): + arg_value = item.callspec.params.get(arg_name) + allowed_values = [] + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, [marker_name + "_list"], len(marker.args)) + allowed_values.extend(marker.args[0]) + + if not allowed_values or arg_value in allowed_values: + continue + skip_message = ( + "Skipping test {test_name} because {arg_name} {arg_value} is not in {marker} allowed values: " + "{allowed_values}".format( + test_name=item.name, + arg_name=arg_name, + arg_value=arg_value, + marker=marker_name, + allowed_values=allowed_values, + ) ) - ) - logging.info(skip_message) - pytest.skip(skip_message) + logging.info(skip_message) + items.remove(item) -def check_marker_skip_list(item, marker_name, arg_name): +def check_marker_skip_list(items, marker_name, arg_name): """ Skip all tests that are annotated with marker marker_name and have the arg value corresponding to arg_name listed in the list passed as first argument to the marker. @@ -93,30 +95,31 @@ def test(arg_name) The test is executed only if arg_name is not equal to "value1" or "value2". - :param item: pytest Item object annotated with markers. + :param items: pytest Item objects annotated with markers. :param marker_name: name of the marker to process. :param arg_name: arg name the marker values should be compared to. """ - arg_value = item.funcargs.get(arg_name) - for marker in item.iter_markers(name=marker_name): - _validate_marker(marker_name, [marker_name + "_skip_list"], len(marker.args)) - skip_values = marker.args[0] - if arg_value in skip_values: - skip_message = ( - "Skipping test {test_name} because {arg_name} {arg_value} is in {marker} allowed values:" - "{skip_values}".format( - test_name=item.name, - arg_name=arg_name, - arg_value=arg_value, - marker=marker_name, - skip_values=skip_values, + for item in list(items): + arg_value = item.callspec.params.get(arg_name) + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, [marker_name + "_skip_list"], len(marker.args)) + skip_values = marker.args[0] + if arg_value in skip_values: + skip_message = ( + "Skipping test {test_name} because {arg_name} {arg_value} is in {marker} allowed values:" + "{skip_values}".format( + test_name=item.name, + arg_name=arg_name, + arg_value=arg_value, + marker=marker_name, + skip_values=skip_values, + ) ) - ) - logging.info(skip_message) - pytest.skip(skip_message) + logging.info(skip_message) + items.remove(item) -def check_marker_skip_dimensions(item): +def check_marker_skip_dimensions(items): """ Skip all tests that are annotated with @pytest.mark.skip_dimensions and have the args (region, instance, os, scheduler) match those specified in the marker. 
@@ -130,34 +133,35 @@ def test(region, instance, os, scheduler) The test is executed only if the test args (region, instance, os, scheduler) do not match ("a", "b", "*", "d") - :param item: pytest Item object annotated with markers. + :param items: pytest Item objects annotated with markers. """ marker_name = "skip_dimensions" - args_values = [] - for dimension in DIMENSIONS_MARKER_ARGS: - args_values.append(item.funcargs.get(dimension)) - for marker in item.iter_markers(name=marker_name): - _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) - if len(marker.args) != len(DIMENSIONS_MARKER_ARGS): - logging.error( - "Marker {marker_name} requires the following args: {args}".format( - marker_name=marker_name, args=DIMENSIONS_MARKER_ARGS + for item in list(items): + args_values = [] + for dimension in DIMENSIONS_MARKER_ARGS: + args_values.append(item.callspec.params.get(dimension)) + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) + if len(marker.args) != len(DIMENSIONS_MARKER_ARGS): + logging.error( + "Marker {marker_name} requires the following args: {args}".format( + marker_name=marker_name, args=DIMENSIONS_MARKER_ARGS + ) ) - ) - raise ValueError - dimensions_match = _compare_dimension_lists(args_values, marker.args) - if dimensions_match: - skip_message = ( - "Skipping test {test_name} because dimensions {args_values} match {marker}: " - "{skip_values}".format( - test_name=item.name, args_values=args_values, marker=marker_name, skip_values=marker.args + raise ValueError + dimensions_match = _compare_dimension_lists(args_values, marker.args) + if dimensions_match: + skip_message = ( + "Skipping test {test_name} because dimensions {args_values} match {marker}: " + "{skip_values}".format( + test_name=item.name, args_values=args_values, marker=marker_name, skip_values=marker.args + ) ) - ) - logging.info(skip_message) - pytest.skip(skip_message) + logging.info(skip_message) + items.remove(item) -def check_marker_dimensions(item): +def check_marker_dimensions(items): """ Execute all tests that are annotated with @pytest.mark.dimensions and have the args (region, instance, os, scheduler) match those specified in the marker. @@ -170,29 +174,33 @@ def test(region, instance, os, scheduler) The test is executed only if the test args (region, instance, os, scheduler) match ("a", "b", "*", "d") - :param item: pytest Item object annotated with markers. + :param items: pytest Item objects annotated with markers. 
""" marker_name = "dimensions" - test_args_value = [] - for dimension in DIMENSIONS_MARKER_ARGS: - test_args_value.append(item.funcargs.get(dimension)) - allowed_values = [] - for marker in item.iter_markers(name=marker_name): - _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) - allowed_values.append(marker.args) - dimensions_match = _compare_dimension_lists(test_args_value, marker.args) - if dimensions_match: - return - - if allowed_values: - skip_message = ( - "Skipping test {test_name} because dimensions {test_args_value} do not match any marker {marker} values: " - "{allowed_values}".format( - test_name=item.name, test_args_value=test_args_value, marker=marker_name, allowed_values=allowed_values + for item in items: + test_args_value = [] + for dimension in DIMENSIONS_MARKER_ARGS: + test_args_value.append(item.callspec.params.get(dimension)) + allowed_values = [] + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) + allowed_values.append(marker.args) + dimensions_match = _compare_dimension_lists(test_args_value, marker.args) + if dimensions_match: + continue + + if allowed_values: + skip_message = ( + "Skipping test {test_name} because dimensions {test_args_value} do not match any marker {marker}" + " values: {allowed_values}".format( + test_name=item.name, + test_args_value=test_args_value, + marker=marker_name, + allowed_values=allowed_values, + ) ) - ) - logging.info(skip_message) - pytest.skip(skip_message) + logging.info(skip_message) + items.remove(item) def _validate_marker(marker_name, expected_args, args_count): From 5c7484bbfa64821f39ddd31d7383fae2975b2018 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Wed, 5 Jun 2019 16:15:07 +0200 Subject: [PATCH 103/121] Improve custom AMI docs Signed-off-by: Luca Carrogu --- docs/tutorials/02_ami_customization.rst | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/docs/tutorials/02_ami_customization.rst b/docs/tutorials/02_ami_customization.rst index 28c2e3f3cf..35c0c90894 100644 --- a/docs/tutorials/02_ami_customization.rst +++ b/docs/tutorials/02_ami_customization.rst @@ -23,9 +23,16 @@ tutorial will guide you through the process. How to customize the AWS ParallelCluster AMI ============================================ -There are three ways to use a custom AWS ParallelCluster AMI, two of them require to build a new AMI that will be -available under your AWS account and one does not require to build anything in advance. Feel free to select the -appropriate method based on your needs. +There are three alternative ways to use a custom AWS ParallelCluster AMI, two of them require to build a new AMI that +will be available under your AWS account and one does not require to build anything in advance: + +- modify an AWS ParallelCluster AMI, when you want to install your software on top of an official AWS ParalleCluster AMI +- build a custom AWS ParallelCluster AMI, when you have an AMI with customization and software already in place, and + want to build an AWS ParalleCluster AMI on top of it +- use a Custom AMI at runtime, when you don't want to create anything in advance, AWS ParallelCluster will install + everything it needs at runtime (during cluster creation time and scale-up time) + +Feel free to select the appropriate method based on your needs. 
Modify an AWS ParallelCluster AMI --------------------------------- @@ -35,10 +42,13 @@ the components required for AWS ParallelCluster to function installed and config base. #. Find the AMI which corresponds to the region you will be utilizing from the AMI list. - The AMI list to use must match the version of the product e.g. + .. warning:: + The AMI list to use must match the version of AWS ParallelCluster, for example: - - for ParallelCluster 2.0.2 -> https://github.com/aws/aws-parallelcluster/blob/v2.0.2/amis.txt - - for CfnCluster 1.6.1 -> https://github.com/aws/aws-parallelcluster/blob/v1.6.1/amis.txt + - for AWS ParallelCluster 2.3.1 -> https://github.com/aws/aws-parallelcluster/blob/v2.3.1/amis.txt + - for AWS ParallelCluster 2.2.1 -> https://github.com/aws/aws-parallelcluster/blob/v2.2.1/amis.txt + - for AWS ParallelCluster 2.1.1 -> https://github.com/aws/aws-parallelcluster/blob/v2.1.1/amis.txt + - for CfnCluster 1.6.1 -> https://github.com/aws/aws-parallelcluster/blob/v1.6.1/amis.txt #. Within the EC2 Console, choose "Launch Instance". #. Navigate to "Community AMIs", and enter the AMI id for your region into the search box. @@ -77,6 +87,9 @@ starting from the one you provide as base:: pcluster createami --ami-id --os +.. warning:: + You cannot use a ParalleCluster AMI as for the create command or the create will fail. + For other parameters, please consult the command help:: pcluster createami -h From e749598640d1ea53b1f8c9636c56f9459c338f5f Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Thu, 6 Jun 2019 14:19:43 +0200 Subject: [PATCH 104/121] integ tests - slurm: increase waiting time for cluster size update related to: https://github.com/aws/aws-parallelcluster-node/pull/151 Signed-off-by: Francesco De Martino --- tests/integration-tests/tests/schedulers/test_sge.py | 1 + tests/integration-tests/tests/schedulers/test_slurm.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/schedulers/test_sge.py b/tests/integration-tests/tests/schedulers/test_sge.py index c2b67556cb..4f37c23783 100644 --- a/tests/integration-tests/tests/schedulers/test_sge.py +++ b/tests/integration-tests/tests/schedulers/test_sge.py @@ -41,6 +41,7 @@ def test_sge(region, pcluster_config_reader, clusters_factory): _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime) _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) _test_job_arrays_and_parallel_jobs(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) + # TODO: _test_dynamic_max_cluster_size assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index ce6c171156..4c68423c81 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -65,12 +65,14 @@ def _test_dynamic_max_cluster_size(remote_command_executor, region, asg_name): # Change ASG value and check dummy-nodes settings new_max_size = 1 asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=new_max_size) - time.sleep(40) + # sleeping for 200 seconds since daemons fetch this data every 3 minutes + time.sleep(200) _assert_dummy_nodes(remote_command_executor, new_max_size) # Restore initial cluster size 
asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=current_max_size) - time.sleep(40) + # sleeping for 200 seconds since daemons fetch this data every 3 minutes + time.sleep(200) _assert_dummy_nodes(remote_command_executor, current_max_size) From 0f8a1171a67420016730270c3f80d5f05b35fff9 Mon Sep 17 00:00:00 2001 From: lexaf Date: Wed, 5 Jun 2019 13:43:51 -0700 Subject: [PATCH 105/121] Add env override for default config file location. Add environment variable override, `AWS_PARALLELCLUSTER_CONFIG_FILE`, for default configuration file location. Intended as an analogy to the standard `AWS_SHARED_CREDENTIALS_FILE`, allowing environment-based specification of a parallelcluster target configuration. Minor update to Configuration doc section to clarify configuration file options. --- cli/pcluster/cfnconfig.py | 5 ++++- docs/configuration.rst | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index 0b258aceeb..2dc62accd2 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -131,10 +131,13 @@ def __init_config(self): :return: configuration object """ - # Determine config file name based on args or default + # Determine config file name based on args, env or default if hasattr(self.args, "config_file") and self.args.config_file is not None: config_file = self.args.config_file default_config = False + elif "AWS_PARALLELCLUSTER_CONFIG_FILE" in os.environ: + config_file = os.environ["AWS_PARALLELCLUSTER_CONFIG_FILE"] + default_config = False else: config_file = os.path.expanduser(os.path.join("~", ".parallelcluster", "config")) default_config = True diff --git a/docs/configuration.rst b/docs/configuration.rst index e9adafe6cc..fb147437ae 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3,6 +3,8 @@ Configuration .. toctree:: ParallelCluster uses the file ``~/.parallelcluster/config`` by default for all configuration parameters. +You can change the location of the config file via the ``--config`` command option or by setting the +AWS_PARALLELCLUSTER_CONFIG_FILE environment variable. An example configuration file can be found at ``site-packages/aws-parallelcluster/examples/config``. From c36b8bfa99285bc996669744faec7b5b5a125349 Mon Sep 17 00:00:00 2001 From: lexaf Date: Wed, 5 Jun 2019 15:15:45 -0700 Subject: [PATCH 106/121] Fix broken link in docs/getting_started.rst --- docs/getting_started.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index e0c97d09cf..1346dada2d 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -152,7 +152,7 @@ Next, a simple cluster launches into a VPC and uses an existing subnet which sup for the subnet is :code:`0.0.0.0/0 => igw-xxxxxx`. The VPC must have :code:`DNS Resolution = yes` and :code:`DNS Hostnames = yes`. It should also have DHCP options with the correct :code:`domain-name` for the region, as defined in the docs: `VPC DHCP -Options `_. +Options `_. 
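The configuration file resolution order introduced by PATCH 105/121 above is: an explicit --config argument wins, then the AWS_PARALLELCLUSTER_CONFIG_FILE environment variable, then the default ~/.parallelcluster/config path. The following is a minimal Python sketch of the equivalent lookup, illustrative only: resolve_config_file is a hypothetical helper, not part of the CLI, though the names it returns mirror the config_file/default_config pair used in cfnconfig.py::

    import os

    def resolve_config_file(cli_config_file=None):
        # 1. An explicit --config argument always takes precedence.
        if cli_config_file is not None:
            return cli_config_file, False
        # 2. Otherwise honor the environment variable added by the patch,
        #    analogous to the standard AWS_SHARED_CREDENTIALS_FILE.
        if "AWS_PARALLELCLUSTER_CONFIG_FILE" in os.environ:
            return os.environ["AWS_PARALLELCLUSTER_CONFIG_FILE"], False
        # 3. Fall back to the default location under the user's home directory.
        return os.path.expanduser(os.path.join("~", ".parallelcluster", "config")), True

For example, exporting AWS_PARALLELCLUSTER_CONFIG_FILE=/path/to/alternate/config selects an alternate configuration for every subsequent pcluster invocation without repeating --config.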
Once all of those settings contain valid values, you can launch the cluster by running the create command: From 714528c76423d4f811c677c5aa86c786a7e41227 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 7 Jun 2019 09:56:52 +0200 Subject: [PATCH 107/121] integ tests: fix tests collection Signed-off-by: Francesco De Martino --- tests/integration-tests/conftest_markers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration-tests/conftest_markers.py b/tests/integration-tests/conftest_markers.py index 9d1b09885a..8f3ce8a80e 100644 --- a/tests/integration-tests/conftest_markers.py +++ b/tests/integration-tests/conftest_markers.py @@ -159,6 +159,7 @@ def test(region, instance, os, scheduler) ) logging.info(skip_message) items.remove(item) + break def check_marker_dimensions(items): From d850a82cfa00e5c11c248d41939cdc52a60e9670 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 7 Jun 2019 10:24:45 +0200 Subject: [PATCH 108/121] integ tests: clone items list on tests collection Signed-off-by: Francesco De Martino --- tests/integration-tests/conftest_markers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/conftest_markers.py b/tests/integration-tests/conftest_markers.py index 8f3ce8a80e..f31d74aae9 100644 --- a/tests/integration-tests/conftest_markers.py +++ b/tests/integration-tests/conftest_markers.py @@ -178,7 +178,7 @@ def test(region, instance, os, scheduler) :param items: pytest Item objects annotated with markers. """ marker_name = "dimensions" - for item in items: + for item in list(items): test_args_value = [] for dimension in DIMENSIONS_MARKER_ARGS: test_args_value.append(item.callspec.params.get(dimension)) From dbe2fd176e69fe61b13112c7d0d4ec3473f6201c Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 7 Jun 2019 10:52:48 +0200 Subject: [PATCH 109/121] integ tests: fix check_marker_dimensions Signed-off-by: Francesco De Martino --- tests/integration-tests/conftest_markers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/conftest_markers.py b/tests/integration-tests/conftest_markers.py index f31d74aae9..9116785ef2 100644 --- a/tests/integration-tests/conftest_markers.py +++ b/tests/integration-tests/conftest_markers.py @@ -183,14 +183,15 @@ def test(region, instance, os, scheduler) for dimension in DIMENSIONS_MARKER_ARGS: test_args_value.append(item.callspec.params.get(dimension)) allowed_values = [] + dimensions_match = False for marker in item.iter_markers(name=marker_name): _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) allowed_values.append(marker.args) dimensions_match = _compare_dimension_lists(test_args_value, marker.args) if dimensions_match: - continue + break - if allowed_values: + if not dimensions_match and allowed_values: skip_message = ( "Skipping test {test_name} because dimensions {test_args_value} do not match any marker {marker}" " values: {allowed_values}".format( From abaa4d638edc79a6eb5c247264096f22362d60d3 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Tue, 28 May 2019 07:52:56 -0700 Subject: [PATCH 110/121] EFA Integration Test Signed-off-by: Sean Smith --- .gitignore | 1 + .../tests/common/schedulers_common.py | 50 +++++++++++++-- .../tests/test_efa/test_efa.py | 64 +++++++++++++++++++ .../test_efa/test_efa/mpi_hello_world.c | 35 ++++++++++ .../test_efa/test_efa/osu_benchmarks.sh | 0 .../test_efa/test_efa/pcluster.config.ini | 22 +++++++ .../test_efa/test_efa/sge_osu_submit.sh | 0 
.../test_efa/test_efa/test_efa/sge_submit.sh | 5 ++ 8 files changed, 171 insertions(+), 6 deletions(-) create mode 100644 tests/integration-tests/tests/test_efa/test_efa.py create mode 100644 tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c create mode 100644 tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh create mode 100644 tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini create mode 100644 tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh create mode 100644 tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_submit.sh diff --git a/.gitignore b/.gitignore index ae57b2eea5..7eaf8ce5b6 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ build/ .coverage assets/ report.html +tests_outputs/ diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index fbf49a0f00..d230203404 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -55,6 +55,16 @@ def get_job_exit_status(self, job_id): """ pass + @abstractmethod + def submit_interactive_command(self, command, nodes=1): + """ + Submit a interactive command to the scheduler. + + :param command: command to submit. + :return: result from remote command execution. + """ + pass + @abstractmethod def submit_command(self, command, nodes=1): """ @@ -66,7 +76,7 @@ def submit_command(self, command, nodes=1): pass @abstractmethod - def submit_script(self, script, nodes=1): + def submit_script(self, script, nodes=1, additional_files=None): """ Submit a job to the scheduler by using a script file. @@ -115,10 +125,13 @@ def assert_job_submitted(self, awsbsub_output): # noqa: D102 assert_that(match).is_not_none() return match.group(1) + def submit_interactive_command(self, command, nodes=1): # noqa: D102 + raise NotImplementedError + def submit_command(self, command, nodes=1): # noqa: D102 return self._remote_command_executor.run_remote_command('echo "{0}" | awsbsub -n {1}'.format(command, nodes)) - def submit_script(self, script, nodes=1): # noqa: D102 + def submit_script(self, script, nodes=1, additional_files=None): # noqa: D102 raise NotImplementedError def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 @@ -161,6 +174,16 @@ def assert_job_submitted(self, qsub_output, is_array=False): # noqa: D102 assert_that(match).is_not_none() return match.group(1) + def submit_interactive_command(self, command, nodes=1, slots=None): # noqa: D102 + flags = "" + if nodes != 1: + raise Exception("SGE does not support nodes option") + if slots: + flags += "-pe mpi {0} ".format(slots) + return self._remote_command_executor.run_remote_command( + "echo '{0}' | qrsh {1}".format(command, flags), raise_on_error=False + ) + def submit_command(self, command, nodes=1, slots=None, hold=False): # noqa: D102 flags = "" if nodes != 1: @@ -173,10 +196,13 @@ def submit_command(self, command, nodes=1, slots=None, hold=False): # noqa: D10 "echo '{0}' | qsub {1}".format(command, flags), raise_on_error=False ) - def submit_script(self, script, nodes=1): # noqa: D102 + def submit_script(self, script, nodes=1, additional_files=None): # noqa: D102 + if not additional_files: + additional_files = [] + additional_files.append(script) script_name = os.path.basename(script) return self._remote_command_executor.run_remote_command( - "qsub {0}".format(script_name), additional_files=[script] + "qsub 
{0}".format(script_name), additional_files=additional_files ) def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 @@ -217,19 +243,28 @@ def assert_job_submitted(self, sbatch_output): # noqa: D102 assert_that(match).is_not_none() return match.group(1) + def submit_interactive_command(self, command, nodes=1, host=None): # noqa: D102 + submission_command = "srun -N {0} --wrap='{1}'".format(nodes, command) + if host: + submission_command += " --nodelist={0}".format(host) + return self._remote_command_executor.run_remote_command(submission_command) + def submit_command(self, command, nodes=1, host=None): # noqa: D102 submission_command = "sbatch -N {0} --wrap='{1}'".format(nodes, command) if host: submission_command += " --nodelist={0}".format(host) return self._remote_command_executor.run_remote_command(submission_command) - def submit_script(self, script, nodes=1, host=None): # noqa: D102 + def submit_script(self, script, nodes=1, host=None, additional_files=None): # noqa: D102 + if not additional_files: + additional_files = [] + additional_files.append(script) script_name = os.path.basename(script) submission_command = "sbatch" if host: submission_command += " --nodelist={0}".format(host) submission_command += " -N {0} {1}".format(nodes, script_name) - return self._remote_command_executor.run_remote_command(submission_command, additional_files=[script]) + return self._remote_command_executor.run_remote_command(submission_command, additional_files=additional_files) def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)) @@ -262,6 +297,9 @@ def get_job_exit_status(self, job_id): # noqa: D102 def assert_job_submitted(self, qsub_output): # noqa: D102 raise NotImplementedError + def submit_interactive_command(self, command, nodes=1): # noqa: D102 + raise NotImplementedError + def submit_command(self, command): # noqa: D102 raise NotImplementedError diff --git a/tests/integration-tests/tests/test_efa/test_efa.py b/tests/integration-tests/tests/test_efa/test_efa.py new file mode 100644 index 0000000000..3b7a7aa2b1 --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa.py @@ -0,0 +1,64 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging + +import pytest + +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor +from tests.common.schedulers_common import get_scheduler_commands + + +@pytest.mark.regions(["us-east-1"]) +@pytest.mark.instances(["c5n.18xlarge", "p3dn.24xlarge", "i3en.24xlarge"]) +@pytest.mark.oss(["alinux", "centos7"]) +@pytest.mark.schedulers(["sge", "slurm"]) +@pytest.mark.usefixtures("os", "instance", "scheduler") +def test_efa(scheduler, pcluster_config_reader, clusters_factory, test_datadir): + """ + Test all EFA Features. + + Grouped all tests in a single function so that cluster can be reused for all of them. 
+ """ + scaledown_idletime = 3 + max_queue_size = 5 + cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + _test_efa_installed(remote_command_executor) + _test_efa_mpi(remote_command_executor, scheduler, test_datadir) + + +def _test_efa_installed(remote_command_executor, scheduler): + # Output contains: + # 00:06.0 Ethernet controller: Amazon.com, Inc. Device efa0 + logging.info("Testing EFA Installed") + scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + result = scheduler_commands.submit_interactive_command("/sbin/lspci") + assert_that(result.stdout).contains("00:06.0 Ethernet controller: Amazon.com, Inc. Device efa0") + + +def _test_efa_mpi(remote_command_executor, scheduler, test_datadir): + logging.info("Testing EFA Installed") + # Compile mpi script + result = remote_command_executor.run_remote_command( + "/opt/amazon/efa/bin/mpicc -o mpi_hello_world mpi_hello_world.c", + additional_files=[str(test_datadir / "mpi_hello_world.c")], + ).stdout + + # submit script using additional files + scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + + result = scheduler_commands.submit_script(str(test_datadir / "{0}_submit.sh".format(scheduler))) + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id) diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c new file mode 100644 index 0000000000..3a4dc33712 --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c @@ -0,0 +1,35 @@ +// Copyright 2011 www.mpitutorial.com +// +// An intro MPI hello world program that uses MPI_Init, MPI_Comm_size, +// MPI_Comm_rank, MPI_Finalize, and MPI_Get_processor_name. +// +#include +#include +#include + +int main(int argc, char** argv) { + // Initialize the MPI environment. The two arguments to MPI Init are not + // currently used by MPI implementations, but are there in case future + // implementations might need the arguments. + MPI_Init(NULL, NULL); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + // Get the name of the processor + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + + // Print off a hello world message + printf("Hello world from processor %s, rank %d out of %d processors\n", + processor_name, world_rank, world_size); + + // Finalize the MPI environment. 
No more MPI calls can be made after this + MPI_Finalize(); +} \ No newline at end of file diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini b/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini new file mode 100644 index 0000000000..46047457b4 --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini @@ -0,0 +1,22 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = t2.micro +compute_instance_type = {{ instance }} +initial_queue_size = 2 +maintain_initial_size = true +enable_efa = compute +placement_group = DYNAMIC + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_submit.sh new file mode 100644 index 0000000000..e2b38c4c1a --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_submit.sh @@ -0,0 +1,5 @@ +#!/bin/bash +#$ -pe mpi 144 + +module load openmpi +mpirun -N 1 -np 2 "mpi_hello_world" \ No newline at end of file From 4a81d770922d36c0d55feb3284ecd9614e549b61 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Thu, 6 Jun 2019 14:40:14 -0700 Subject: [PATCH 111/121] Add OSU Benchmarks to Integration Tests Signed-off-by: Sean Smith --- .../tests/test_efa/test_efa.py | 21 ++++++++++++++++++- .../test_efa/test_efa/osu_benchmarks.sh | 9 ++++++++ .../test_efa/test_efa/sge_osu_submit.sh | 9 ++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/test_efa/test_efa.py b/tests/integration-tests/tests/test_efa/test_efa.py index 3b7a7aa2b1..9ef0896b1c 100644 --- a/tests/integration-tests/tests/test_efa/test_efa.py +++ b/tests/integration-tests/tests/test_efa/test_efa.py @@ -20,7 +20,7 @@ @pytest.mark.regions(["us-east-1"]) @pytest.mark.instances(["c5n.18xlarge", "p3dn.24xlarge", "i3en.24xlarge"]) -@pytest.mark.oss(["alinux", "centos7"]) +@pytest.mark.oss(["alinux", "centos7", "ubuntu1604"]) @pytest.mark.schedulers(["sge", "slurm"]) @pytest.mark.usefixtures("os", "instance", "scheduler") def test_efa(scheduler, pcluster_config_reader, clusters_factory, test_datadir): @@ -55,6 +55,7 @@ def _test_efa_mpi(remote_command_executor, scheduler, test_datadir): "/opt/amazon/efa/bin/mpicc -o mpi_hello_world mpi_hello_world.c", additional_files=[str(test_datadir / "mpi_hello_world.c")], ).stdout + logging.info(result) # submit script using additional files scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) @@ -62,3 +63,21 @@ def _test_efa_mpi(remote_command_executor, scheduler, test_datadir): result = scheduler_commands.submit_script(str(test_datadir / "{0}_submit.sh".format(scheduler))) job_id = scheduler_commands.assert_job_submitted(result.stdout) 
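    # wait for the job to complete before asserting on its final state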
scheduler_commands.wait_job_completed(job_id) + scheduler_commands.assert_job_succeeded(job_id) + + +def _test_osu_benchmarks(remote_command_executor, scheduler, test_datadir): + logging.info("Testing EFA Installed") + # Compile mpi script + result = remote_command_executor.run_remote_command( + "/bin/bash osu_benchmarks.sh", additional_files=[str(test_datadir / "osu_benchmarks.sh")] + ).stdout + logging.info(result) + + # submit script using additional files + scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + + result = scheduler_commands.submit_script(str(test_datadir / "{0}_submit_osu_benchmarks.sh".format(scheduler))) + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id) + scheduler_commands.assert_job_succeeded(job_id) diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh index e69de29bb2..8d6abb3293 100644 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +cd /shared +wget http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.4.tar.gz +tar zxvf ./osu-micro-benchmarks-5.4.tar.gz +cd osu-micro-benchmarks-5.4/ +./configure CC=/opt/amazon/efa/bin/mpicc CXX=/opt/amazon/efa/bin/mpicxx +make +# make install in the submit script \ No newline at end of file diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh index e69de29bb2..17fecd2021 100644 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh @@ -0,0 +1,9 @@ +#!/bin/bash +#$ -pe mpi 144 + +cd /shared/osu-micro-benchmarks-5.4 +sudo make install # on all compute nodes + +# actually run the benchmark +module load openmpi +mpirun -N 1 -np 2 /usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_latency \ No newline at end of file From e997919196ed0f479d44f1b9761ae73e8322ea43 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 7 Jun 2019 18:06:35 +0200 Subject: [PATCH 112/121] integ tests: various fixes for EFA tests Signed-off-by: Francesco De Martino --- .../remote_command_executor.py | 8 ++- .../tests/common/schedulers_common.py | 61 +++++----------- .../tests/test_efa/test_efa.py | 72 +++++++++++-------- ...u_benchmarks.sh => init_osu_benchmarks.sh} | 3 +- .../test_efa/test_efa/mpi_hello_world.c | 2 +- .../test_efa/test_efa/test_efa/mpi_submit.sh | 5 ++ .../test_efa/test_efa/test_efa/osu_submit.sh | 5 ++ .../test_efa/test_efa/pcluster.config.ini | 3 +- .../test_efa/test_efa/sge_osu_submit.sh | 9 --- .../test_efa/test_efa/test_efa/sge_submit.sh | 5 -- 10 files changed, 82 insertions(+), 91 deletions(-) rename tests/integration-tests/tests/test_efa/test_efa/test_efa/{osu_benchmarks.sh => init_osu_benchmarks.sh} (86%) create mode 100644 tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh create mode 100644 tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh delete mode 100644 tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh delete mode 100644 tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_submit.sh diff --git a/tests/integration-tests/remote_command_executor.py 
b/tests/integration-tests/remote_command_executor.py index 46d68ef778..ee7f5a7d39 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -84,7 +84,7 @@ def run_remote_command( raise RemoteCommandExecutionError(result) return result - def run_remote_script(self, script_file, args=None, log_error=True, additional_files=None): + def run_remote_script(self, script_file, args=None, log_error=True, additional_files=None, hide=False): """ Execute a script remotely on the cluster master node. @@ -93,6 +93,7 @@ def run_remote_script(self, script_file, args=None, log_error=True, additional_f :param args: args to pass to the script when invoked. :param log_error: log errors. :param additional_files: additional files to copy before executing script. + :param hide: do not print command output to the local stdout :return: result of the execution. """ script_name = os.path.basename(script_file) @@ -100,7 +101,10 @@ def run_remote_script(self, script_file, args=None, log_error=True, additional_f if not args: args = [] return self.run_remote_command( - ["/bin/bash", "--login", script_name] + args, log_error=log_error, additional_files=additional_files + ["/bin/bash", "--login", script_name] + args, + log_error=log_error, + additional_files=additional_files, + hide=hide, ) def _copy_additional_files(self, files): diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index d230203404..7fe2762c18 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -56,17 +56,7 @@ def get_job_exit_status(self, job_id): pass @abstractmethod - def submit_interactive_command(self, command, nodes=1): - """ - Submit a interactive command to the scheduler. - - :param command: command to submit. - :return: result from remote command execution. - """ - pass - - @abstractmethod - def submit_command(self, command, nodes=1): + def submit_command(self, command, nodes=1, slots=None): """ Submit a job to the scheduler. @@ -76,7 +66,7 @@ def submit_command(self, command, nodes=1): pass @abstractmethod - def submit_script(self, script, nodes=1, additional_files=None): + def submit_script(self, script, nodes=1, slots=None, additional_files=None): """ Submit a job to the scheduler by using a script file. 
@@ -125,13 +115,10 @@ def assert_job_submitted(self, awsbsub_output): # noqa: D102 assert_that(match).is_not_none() return match.group(1) - def submit_interactive_command(self, command, nodes=1): # noqa: D102 - raise NotImplementedError - - def submit_command(self, command, nodes=1): # noqa: D102 + def submit_command(self, command, nodes=1, slots=None): # noqa: D102 return self._remote_command_executor.run_remote_command('echo "{0}" | awsbsub -n {1}'.format(command, nodes)) - def submit_script(self, script, nodes=1, additional_files=None): # noqa: D102 + def submit_script(self, script, nodes=1, additional_files=None, slots=None): # noqa: D102 raise NotImplementedError def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 @@ -174,16 +161,6 @@ def assert_job_submitted(self, qsub_output, is_array=False): # noqa: D102 assert_that(match).is_not_none() return match.group(1) - def submit_interactive_command(self, command, nodes=1, slots=None): # noqa: D102 - flags = "" - if nodes != 1: - raise Exception("SGE does not support nodes option") - if slots: - flags += "-pe mpi {0} ".format(slots) - return self._remote_command_executor.run_remote_command( - "echo '{0}' | qrsh {1}".format(command, flags), raise_on_error=False - ) - def submit_command(self, command, nodes=1, slots=None, hold=False): # noqa: D102 flags = "" if nodes != 1: @@ -196,13 +173,16 @@ def submit_command(self, command, nodes=1, slots=None, hold=False): # noqa: D10 "echo '{0}' | qsub {1}".format(command, flags), raise_on_error=False ) - def submit_script(self, script, nodes=1, additional_files=None): # noqa: D102 + def submit_script(self, script, nodes=1, slots=None, additional_files=None): # noqa: D102 if not additional_files: additional_files = [] additional_files.append(script) + flags = "" + if slots: + flags += "-pe mpi {0} ".format(slots) script_name = os.path.basename(script) return self._remote_command_executor.run_remote_command( - "qsub {0}".format(script_name), additional_files=additional_files + "qsub {0} {1}".format(flags, script_name), additional_files=additional_files ) def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 @@ -243,19 +223,15 @@ def assert_job_submitted(self, sbatch_output): # noqa: D102 assert_that(match).is_not_none() return match.group(1) - def submit_interactive_command(self, command, nodes=1, host=None): # noqa: D102 - submission_command = "srun -N {0} --wrap='{1}'".format(nodes, command) - if host: - submission_command += " --nodelist={0}".format(host) - return self._remote_command_executor.run_remote_command(submission_command) - - def submit_command(self, command, nodes=1, host=None): # noqa: D102 + def submit_command(self, command, nodes=1, slots=None, host=None): # noqa: D102 submission_command = "sbatch -N {0} --wrap='{1}'".format(nodes, command) if host: submission_command += " --nodelist={0}".format(host) + if slots: + submission_command += " -n {0}".format(slots) return self._remote_command_executor.run_remote_command(submission_command) - def submit_script(self, script, nodes=1, host=None, additional_files=None): # noqa: D102 + def submit_script(self, script, nodes=1, slots=None, host=None, additional_files=None): # noqa: D102 if not additional_files: additional_files = [] additional_files.append(script) @@ -263,12 +239,16 @@ def submit_script(self, script, nodes=1, host=None, additional_files=None): # n submission_command = "sbatch" if host: submission_command += " --nodelist={0}".format(host) - submission_command += " -N {0} {1}".format(nodes, 
script_name)
+        if slots:
+            submission_command += " -n {0}".format(slots)
+        if nodes > 1:
+            submission_command += " -N {0}".format(nodes)
+        submission_command += " {0}".format(script_name)
         return self._remote_command_executor.run_remote_command(submission_command, additional_files=additional_files)

     def assert_job_succeeded(self, job_id, children_number=0):  # noqa: D102
         result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id))
-        return "JobState=COMPLETED" in result.stdout
+        assert_that(result.stdout).contains("JobState=COMPLETED")

     def compute_nodes_count(self):  # noqa: D102
         result = self._remote_command_executor.run_remote_command("sinfo --Node --noheader | grep compute | wc -l")
@@ -297,9 +277,6 @@ def get_job_exit_status(self, job_id):  # noqa: D102
     def assert_job_submitted(self, qsub_output):  # noqa: D102
         raise NotImplementedError

-    def submit_interactive_command(self, command, nodes=1):  # noqa: D102
-        raise NotImplementedError
-
     def submit_command(self, command):  # noqa: D102
         raise NotImplementedError

diff --git a/tests/integration-tests/tests/test_efa/test_efa.py b/tests/integration-tests/tests/test_efa/test_efa.py
index 9ef0896b1c..ece4aaa519 100644
--- a/tests/integration-tests/tests/test_efa/test_efa.py
+++ b/tests/integration-tests/tests/test_efa/test_efa.py
@@ -17,67 +17,79 @@
 from remote_command_executor import RemoteCommandExecutor
 from tests.common.schedulers_common import get_scheduler_commands

+INSTANCES_TO_SLOTS_MAP = {"c5n.18xlarge": 72, "p3dn.24xlarge": 96, "i3en.24xlarge": 96}

-@pytest.mark.regions(["us-east-1"])
+
+@pytest.mark.regions(["us-east-1", "eu-west-1"])
 @pytest.mark.instances(["c5n.18xlarge", "p3dn.24xlarge", "i3en.24xlarge"])
 @pytest.mark.oss(["alinux", "centos7", "ubuntu1604"])
 @pytest.mark.schedulers(["sge", "slurm"])
-@pytest.mark.usefixtures("os", "instance", "scheduler")
-def test_efa(scheduler, pcluster_config_reader, clusters_factory, test_datadir):
+@pytest.mark.usefixtures("os", "region")
+def test_efa(scheduler, instance, pcluster_config_reader, clusters_factory, test_datadir):
     """
     Test all EFA Features.

     Grouped all tests in a single function so that cluster can be reused for all of them.
     """
-    scaledown_idletime = 3
-    max_queue_size = 5
-    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
+    max_queue_size = 2
+    slots_per_instance = INSTANCES_TO_SLOTS_MAP[instance]
+    cluster_config = pcluster_config_reader(max_queue_size=max_queue_size)
     cluster = clusters_factory(cluster_config)
     remote_command_executor = RemoteCommandExecutor(cluster)
+    scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)

-    _test_efa_installed(remote_command_executor, scheduler)
-    _test_efa_mpi(remote_command_executor, scheduler, test_datadir)
+    _test_efa_installed(scheduler_commands, remote_command_executor)
+    _test_efa_mpi(remote_command_executor, scheduler_commands, test_datadir, slots_per_instance)
+    _test_osu_benchmarks(remote_command_executor, scheduler_commands, test_datadir, slots_per_instance)


-def _test_efa_installed(remote_command_executor, scheduler):
+def _test_efa_installed(scheduler_commands, remote_command_executor):
     # Output contains:
     # 00:06.0 Ethernet controller: Amazon.com, Inc.
Device efa0 - logging.info("Testing EFA Installed") - scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) - result = scheduler_commands.submit_interactive_command("/sbin/lspci") + logging.info("Testing EFA installed") + result = scheduler_commands.submit_command("/sbin/lspci > /shared/lspci.out") + + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id) + scheduler_commands.assert_job_succeeded(job_id) + + # Check EFA interface is present on compute node + result = remote_command_executor.run_remote_command("cat /shared/lspci.out") assert_that(result.stdout).contains("00:06.0 Ethernet controller: Amazon.com, Inc. Device efa0") + # Check EFA interface not present on master + result = remote_command_executor.run_remote_command("/sbin/lspci") + assert_that(result.stdout).does_not_contain("00:06.0 Ethernet controller: Amazon.com, Inc. Device efa0") -def _test_efa_mpi(remote_command_executor, scheduler, test_datadir): - logging.info("Testing EFA Installed") + +def _test_efa_mpi(remote_command_executor, scheduler_commands, test_datadir, slots_per_instance): + logging.info("Testing mpi job with EFA") # Compile mpi script - result = remote_command_executor.run_remote_command( + remote_command_executor.run_remote_command( "/opt/amazon/efa/bin/mpicc -o mpi_hello_world mpi_hello_world.c", additional_files=[str(test_datadir / "mpi_hello_world.c")], - ).stdout - logging.info(result) + ) # submit script using additional files - scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) - - result = scheduler_commands.submit_script(str(test_datadir / "{0}_submit.sh".format(scheduler))) + result = scheduler_commands.submit_script(str(test_datadir / "mpi_submit.sh"), slots=2 * slots_per_instance) job_id = scheduler_commands.assert_job_submitted(result.stdout) scheduler_commands.wait_job_completed(job_id) scheduler_commands.assert_job_succeeded(job_id) + mpi_out = remote_command_executor.run_remote_command("cat /shared/mpi.out").stdout + assert_that(mpi_out.splitlines()).is_length(2) + assert_that(mpi_out).matches(r"Hello world from processor ip-.+, rank 0 out of 2 processors") + assert_that(mpi_out).matches(r"Hello world from processor ip-.+, rank 1 out of 2 processors") -def _test_osu_benchmarks(remote_command_executor, scheduler, test_datadir): - logging.info("Testing EFA Installed") - # Compile mpi script - result = remote_command_executor.run_remote_command( - "/bin/bash osu_benchmarks.sh", additional_files=[str(test_datadir / "osu_benchmarks.sh")] - ).stdout - logging.info(result) - # submit script using additional files - scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) +def _test_osu_benchmarks(remote_command_executor, scheduler_commands, test_datadir, slots_per_instance): + logging.info("Running OSU benchmarks") + remote_command_executor.run_remote_script(str(test_datadir / "init_osu_benchmarks.sh"), hide=True) - result = scheduler_commands.submit_script(str(test_datadir / "{0}_submit_osu_benchmarks.sh".format(scheduler))) + result = scheduler_commands.submit_script(str(test_datadir / "osu_submit.sh"), slots=2 * slots_per_instance) job_id = scheduler_commands.assert_job_submitted(result.stdout) scheduler_commands.wait_job_completed(job_id) scheduler_commands.assert_job_succeeded(job_id) + + # TODO: perform assertions on benchmarks results + remote_command_executor.run_remote_command("cat /shared/osu.out") diff --git 
a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh similarity index 86% rename from tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh rename to tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh index 8d6abb3293..1a6174813c 100644 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_benchmarks.sh +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -e cd /shared wget http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.4.tar.gz @@ -6,4 +7,4 @@ tar zxvf ./osu-micro-benchmarks-5.4.tar.gz cd osu-micro-benchmarks-5.4/ ./configure CC=/opt/amazon/efa/bin/mpicc CXX=/opt/amazon/efa/bin/mpicxx make -# make install in the submit script \ No newline at end of file +# make install in the submit script diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c index 3a4dc33712..0f0b0252a2 100644 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c @@ -32,4 +32,4 @@ int main(int argc, char** argv) { // Finalize the MPI environment. No more MPI calls can be made after this MPI_Finalize(); -} \ No newline at end of file +} diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh new file mode 100644 index 0000000000..480b6a34ba --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +module load openmpi +mpirun -N 1 -np 2 "mpi_hello_world" &> /shared/mpi.out diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh new file mode 100644 index 0000000000..7dab24000c --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +module load openmpi +mpirun --map-by ppr:1:node /shared/osu-micro-benchmarks-5.4/mpi/pt2pt/osu_latency &> /shared/osu.out diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini b/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini index 46047457b4..2722fee64d 100644 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini @@ -9,10 +9,11 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} -master_instance_type = t2.micro +master_instance_type = c5.xlarge compute_instance_type = {{ instance }} initial_queue_size = 2 maintain_initial_size = true +max_queue_size = {{ max_queue_size }} enable_efa = compute placement_group = DYNAMIC diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh deleted file mode 100644 index 17fecd2021..0000000000 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_osu_submit.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -#$ -pe mpi 144 - -cd /shared/osu-micro-benchmarks-5.4 -sudo make install # on all 
compute nodes - -# actually run the benchmark -module load openmpi -mpirun -N 1 -np 2 /usr/local/libexec/osu-micro-benchmarks/mpi/pt2pt/osu_latency \ No newline at end of file diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_submit.sh deleted file mode 100644 index e2b38c4c1a..0000000000 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/sge_submit.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -#$ -pe mpi 144 - -module load openmpi -mpirun -N 1 -np 2 "mpi_hello_world" \ No newline at end of file From 9fcfd83b3f106ae87df9585c0e2b171915523bf3 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Thu, 6 Jun 2019 12:40:21 +0200 Subject: [PATCH 113/121] Update changelog for v2.4.0 Signed-off-by: Francesco De Martino --- CHANGELOG.rst | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f8b2677231..93483896f7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,7 +7,64 @@ CHANGELOG **ENHANCEMENTS** -* Add support for Ubuntu in China region `cn-northwest-1` +* Add support for EFA on Centos 7, Amazon Linux and Ubuntu 1604 +* Add support for Ubuntu in China region ``cn-northwest-1`` + +* SGE: + + * process nodes added to or removed from the cluster in batches in order to speed up cluster scaling. + * scale up only if required slots/nodes can be satisfied + * scale down if pending jobs have unsatisfiable CPU/nodes requirements + * add support for jobs in hold/suspended state (this includes job dependencies) + * automatically terminate and replace faulty or unresponsive compute nodes + * add retries in case of failures when adding or removing nodes + * configure scheduler to handle rescheduling and cancellation of jobs running on failing or terminated nodes + +* Slurm: + + * scale up only if required slots/nodes can be satisfied + * scale down if pending jobs have unsatisfiable CPU/nodes requirements + * automatically terminate and replace faulty or unresponsive compute nodes + * decrease SlurmdTimeout to 120 seconds to speed up replacement of faulty nodes + +* Automatically replace compute instances that fail initialization and dump logs to shared home directory. +* Dynamically fetch compute instance type and cluster size in order to support updates in scaling daemons +* Always use full master FQDN when mounting NFS on compute nodes. 
This solves some issues occurring with some networking + setups and custom DNS configurations +* Set soft and hard ulimit on open files to 10000 for all supported OSs +* List the version and status during ``pcluster list`` +* Add option to color the output of ``pcluster list`` +* Remove double quoting of the post_install args +* ``awsbsub``: use override option to set the number of nodes rather than creating multiple JobDefinitions + +**CHANGES** + +* For a better security posture, we're removing AWS credentials from the ``parallelcluster`` config file + Credentials can be now setup following the canonical procedure used for the aws cli +* When using FSx or EFS do not enforce in sanity check that the compute security group is open to 0.0.0.0/0 +* When updating an existing cluster, the same template version is now used, no matter the pcluster cli version +* SQS messages that fail to be processed in ``sqswatcher`` are now re-queued only 3 times and not forever +* Reset ``nodewatcher`` idletime to 0 when the host becomes essential for the cluster (because of min size of ASG or + because there are pending jobs in the scheduler queue) +* SGE: a node is considered as busy when in one of the following states "u", "C", "s", "d", "D", "E", "P", "o". + This allows a quick replacement of the node without waiting for the ``nodewatcher`` to terminate it. +* Do not update DynamoDB table on cluster updates in order to avoid hitting strict API limits (1 update per day). + +**BUG FIXES** + +* Fix issue that was preventing Torque from being used on Centos 7 +* Start node daemons at the end of instance initialization. The time spent for post-install script and node + initialization is not counted as part of node idletime anymore. +* Fix issue which was causing an additional and invalid EBS mount point to be added in case of multiple EBS +* Install Slurm libpmpi/libpmpi2 that is distributed in a separate package since Slurm 17 +* ``pcluster ssh`` command now works for clusters with ``use_public_ips = false`` +* Slurm: add "BeginTime", "NodeDown", "Priority" and "ReqNodeNotAvail" to the pending reasons that trigger + a cluster scaling +* Add a timeout on remote commands execution so that the daemons are not stuck if the compute node is unresponsive +* Fix an edge case that was causing the ``nodewatcher`` to hang forever in case the node had become essential to the + cluster during a call to ``self_terminate``. 
+* Fix ``pcluster start/stop`` commands when used with an ``awsbatch`` cluster
+
 2.3.1
 =====

From cc8737ed2c0aa28c3319d59c561f95ec299f0a67 Mon Sep 17 00:00:00 2001
From: Sean Smith
Date: Thu, 6 Jun 2019 17:26:44 -0700
Subject: [PATCH 114/121] Allow re-use of Cluster in Tests

* `--vpc-stack` allows re-use of the VPC stack between tests
* `--cluster` allows re-use of the cluster between tests
* `--no-delete` makes it easy to not delete created clusters and re-use them

Signed-off-by: Sean Smith
---
 tests/integration-tests/README.md             | 27 ++++
 tests/integration-tests/cfn_stacks_factory.py |  2 +-
 tests/integration-tests/clusters_factory.py   |  4 +-
 tests/integration-tests/conftest.py           | 41 ++++++++++++-------
 tests/integration-tests/test_runner.py        | 25 +++++++++++
 5 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md
index d1f7db70bd..432125b1c6 100644
--- a/tests/integration-tests/README.md
+++ b/tests/integration-tests/README.md
@@ -142,6 +142,33 @@
 The configuration for the custom templates and packages are automatically injected into
 all cluster configs when these are rendered. In case any of these parameters is already set in the cluster config
 then the value in the config is used.
 
+### Re-use clusters and VPC stacks
+
+When developing integration tests, it can be helpful to re-use a cluster between tests.
+This is easily accomplished with the use of the `--vpc-stack` and `--cluster` flags.
+
+If you're starting from scratch, run the test with the `--no-delete` flag.
+This preserves any stacks created for the test:
+
+```bash
+python -m test_runner \
+    ... \
+    --no-delete
+```
+
+Then, once you have a VPC stack and a cluster, reference them when starting a test:
+
+```bash
+python -m test_runner \
+    ... \
+    --vpc-stack "integ-tests-vpc-ncw7zrccsau8uh6k" \
+    --cluster "efa-demo" \
+    --no-delete
+```
+
+Keep in mind that the cluster you pass can have a different `scheduler`, `os` or other features
+than what is specified in the test. This can break the tests in unexpected ways. Be mindful.
+
 ## Write Integration Tests
 
 All integration tests are defined in the `integration-tests/tests` directory.
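For orientation, a minimal test following this layout could look like the sketch below; the fixtures and markers mirror the ones used by the EFA test in this patch series, while the submitted command and the assertion are purely illustrative:

```python
import pytest

from assertpy import assert_that
from remote_command_executor import RemoteCommandExecutor


@pytest.mark.regions(["us-east-1"])
@pytest.mark.schedulers(["slurm"])
@pytest.mark.usefixtures("region", "os", "instance")
def test_example(pcluster_config_reader, clusters_factory):
    # pcluster_config_reader renders the pcluster.config.ini stored in the test data dir
    cluster_config = pcluster_config_reader()
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)

    # illustrative check: any remote command can be run against the master node
    result = remote_command_executor.run_remote_command("hostname")
    assert_that(result.stdout).is_not_empty()
```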
diff --git a/tests/integration-tests/cfn_stacks_factory.py b/tests/integration-tests/cfn_stacks_factory.py index 25d8fb40c2..61ce8e2787 100644 --- a/tests/integration-tests/cfn_stacks_factory.py +++ b/tests/integration-tests/cfn_stacks_factory.py @@ -20,7 +20,7 @@ class CfnStack: """Identify a CloudFormation stack.""" - def __init__(self, name, region, template): + def __init__(self, name, region, template=None): self.name = name self.region = region self.template = template diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index 46bf917312..e53503eed1 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -21,7 +21,7 @@ class Cluster: """Contain all static and dynamic data related to a cluster instance.""" - def __init__(self, name, config_file, ssh_key): + def __init__(self, name, ssh_key, config_file=None): self.name = name self.config_file = config_file self.ssh_key = ssh_key @@ -121,7 +121,7 @@ def create_cluster(self, cluster): # create the cluster logging.info("Creating cluster {0} with config {1}".format(name, config)) self.__created_clusters[name] = cluster - result = run_command(["pcluster", "create", "--config", config, name]) + result = run_command(["pcluster", "create", "--no-rollback", "--config", config, name]) if "Status: {0} - CREATE_COMPLETE".format(cluster.cfn_name) not in result.stdout: error = "Cluster creation failed for {0} with output: {1}".format(name, result.stdout) logging.error(error) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 192db95f4f..c2b8e6f66a 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -56,6 +56,11 @@ def pytest_addoption(parser): parser.addoption("--custom-awsbatchcli-package", help="url to a custom awsbatch cli package") parser.addoption("--custom-node-package", help="url to a custom node package") parser.addoption("--custom-ami", help="custom AMI to use in the tests") + parser.addoption("--vpc-stack", help="Name of an existing vpc stack.") + parser.addoption("--cluster", help="Use an existing cluster instead of creating one.") + parser.addoption( + "--no-delete", action="store_true", default=False, help="Don't delete stacks after tests are complete." 
+ ) def pytest_generate_tests(metafunc): @@ -161,17 +166,21 @@ def clusters_factory(request): factory = ClustersFactory() def _cluster_factory(cluster_config): - cluster_config = _write_cluster_config_to_outdir(request, cluster_config) - cluster = Cluster( - name="integ-tests-" + random_alphanumeric(), - config_file=cluster_config, - ssh_key=request.config.getoption("key_path"), - ) - factory.create_cluster(cluster) + if request.config.getoption("cluster"): + cluster = Cluster(name=request.config.getoption("cluster"), ssh_key=request.config.getoption("key_path")) + else: + cluster_config = _write_cluster_config_to_outdir(request, cluster_config) + cluster = Cluster( + name="integ-tests-" + random_alphanumeric(), + config_file=cluster_config, + ssh_key=request.config.getoption("key_path"), + ) + factory.create_cluster(cluster) return cluster yield _cluster_factory - factory.destroy_all_clusters() + if not request.config.getoption("no_delete"): + factory.destroy_all_clusters() def _write_cluster_config_to_outdir(request, cluster_config): @@ -267,11 +276,12 @@ def _get_default_template_values(vpc_stacks, region, request): @pytest.fixture(scope="session") -def cfn_stacks_factory(): +def cfn_stacks_factory(request): """Define a fixture to manage the creation and destruction of CloudFormation stacks.""" factory = CfnStacksFactory() yield factory - factory.delete_all_stacks() + if not request.config.getoption("no_delete"): + factory.delete_all_stacks() # FIXME: we need to find a better solution to this since AZs are independently mapped to names for each AWS account. @@ -318,7 +328,7 @@ def vpc_stacks(cfn_stacks_factory, request): ) vpc_config = VPCConfig(subnets=[public_subnet, private_subnet]) template = VPCTemplateBuilder(vpc_config).build() - vpc_stacks[region] = _create_vpc_stack(template, region, cfn_stacks_factory) + vpc_stacks[region] = _create_vpc_stack(request, template, region, cfn_stacks_factory) return vpc_stacks @@ -326,9 +336,12 @@ def vpc_stacks(cfn_stacks_factory, request): # If stack creation fails it'll retry once more. This is done to mitigate failures due to resources # not available in randomly picked AZs. 
@retry(stop_max_attempt_number=2, wait_fixed=5000) -def _create_vpc_stack(template, region, cfn_stacks_factory): - stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) - cfn_stacks_factory.create_stack(stack) +def _create_vpc_stack(request, template, region, cfn_stacks_factory): + if request.config.getoption("vpc_stack"): + stack = CfnStack(name=request.config.getoption("vpc_stack"), region=region) + else: + stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) + cfn_stacks_factory.create_stack(stack) return stack diff --git a/tests/integration-tests/test_runner.py b/tests/integration-tests/test_runner.py index 1023073ac3..bf353dbfc7 100644 --- a/tests/integration-tests/test_runner.py +++ b/tests/integration-tests/test_runner.py @@ -63,6 +63,9 @@ "custom_awsbatch_template_url": None, "custom_awsbatchcli_url": None, "custom_ami": None, + "vpc_stack": None, + "cluster": None, + "no_delete": False, } @@ -152,6 +155,16 @@ def _init_argparser(): parser.add_argument( "--custom-ami", help="custom AMI to use for all tests.", default=TEST_DEFAULTS.get("custom_ami") ) + parser.add_argument("--vpc-stack", help="Name of an existing vpc stack.", default=TEST_DEFAULTS.get("vpc_stack")) + parser.add_argument( + "--cluster", help="Use an existing cluster instead of creating one.", default=TEST_DEFAULTS.get("cluster") + ) + parser.add_argument( + "--no-delete", + action="store_true", + help="Don't delete stacks after tests are complete.", + default=TEST_DEFAULTS.get("no_delete"), + ) return parser @@ -199,6 +212,7 @@ def _get_pytest_args(args, regions, log_file, out_dir): pytest_args.append("--html={0}/{1}/results.html".format(args.output_dir, out_dir)) _set_custom_packages_args(args, pytest_args) + _set_custom_stack_args(args, pytest_args) return pytest_args @@ -223,6 +237,17 @@ def _set_custom_packages_args(args, pytest_args): pytest_args.extend(["--custom-ami", args.custom_ami]) +def _set_custom_stack_args(args, pytest_args): + if args.vpc_stack: + pytest_args.extend(["--vpc-stack", args.vpc_stack]) + + if args.cluster: + pytest_args.extend(["--cluster", args.cluster]) + + if args.no_delete: + pytest_args.append("--no-delete") + + def _get_pytest_regionalized_args(region, args): return _get_pytest_args( args=args, From 64609aab127f91d1264eaf042a8beac7f1ce3e20 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Fri, 7 Jun 2019 19:26:30 +0200 Subject: [PATCH 115/121] Rename AWS_PARALLELCLUSTER_CONFIG_FILE env var to AWS_PCLUSTER_CONFIG_FILE Signed-off-by: Francesco De Martino --- cli/pcluster/cfnconfig.py | 4 ++-- docs/configuration.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index 2dc62accd2..489cb4d3d2 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -135,8 +135,8 @@ def __init_config(self): if hasattr(self.args, "config_file") and self.args.config_file is not None: config_file = self.args.config_file default_config = False - elif "AWS_PARALLELCLUSTER_CONFIG_FILE" in os.environ: - config_file = os.environ["AWS_PARALLELCLUSTER_CONFIG_FILE"] + elif "AWS_PCLUSTER_CONFIG_FILE" in os.environ: + config_file = os.environ["AWS_PCLUSTER_CONFIG_FILE"] default_config = False else: config_file = os.path.expanduser(os.path.join("~", ".parallelcluster", "config")) diff --git a/docs/configuration.rst b/docs/configuration.rst index fb147437ae..3ed1f24f34 100644 --- a/docs/configuration.rst +++ 
b/docs/configuration.rst @@ -4,7 +4,7 @@ Configuration ParallelCluster uses the file ``~/.parallelcluster/config`` by default for all configuration parameters. You can change the location of the config file via the ``--config`` command option or by setting the -AWS_PARALLELCLUSTER_CONFIG_FILE environment variable. +``AWS_PCLUSTER_CONFIG_FILE`` environment variable. An example configuration file can be found at ``site-packages/aws-parallelcluster/examples/config``. From ea9f0404b78cd25fd1df01488242f2a9dedf8648 Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Fri, 7 Jun 2019 15:57:19 -0700 Subject: [PATCH 116/121] Add FSx tag Signed-off-by: Sean Smith --- cloudformation/aws-parallelcluster.cfn.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index abb1fe6e27..f5692344ef 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -2081,7 +2081,7 @@ "Key": "aws-parallelcluster-filesystem", "Value": { "Fn::Sub": [ - "efs=${efs}, multiebs=${NumberOfEBSVol}, raid=${raid}", + "efs=${efs}, multiebs=${NumberOfEBSVol}, raid=${raid}, fsx=${fsx}", { "efs": { "Fn::If": [ @@ -2096,6 +2096,13 @@ "1", "0" ] + }, + "fsx": { + "Fn::If": [ + "CreateFSXSubstack", + "1", + "0" + ] } } ] From f77d22330ca819bd687fa739b45a04bf05c5e17d Mon Sep 17 00:00:00 2001 From: Sean Smith Date: Fri, 7 Jun 2019 14:38:46 -0700 Subject: [PATCH 117/121] EFA Test Fixes Signed-off-by: Sean Smith --- tests/integration-tests/cfn_stacks_factory.py | 2 +- tests/integration-tests/clusters_factory.py | 4 ++-- tests/integration-tests/conftest.py | 21 ++++++++++--------- .../tests/test_efa/test_efa.py | 12 ++++++----- .../test_efa/test_efa/init_osu_benchmarks.sh | 3 +-- .../test_efa/test_efa/test_efa/mpi_submit.sh | 2 +- .../test_efa/test_efa/test_efa/osu_submit.sh | 2 +- 7 files changed, 24 insertions(+), 22 deletions(-) diff --git a/tests/integration-tests/cfn_stacks_factory.py b/tests/integration-tests/cfn_stacks_factory.py index 61ce8e2787..25d8fb40c2 100644 --- a/tests/integration-tests/cfn_stacks_factory.py +++ b/tests/integration-tests/cfn_stacks_factory.py @@ -20,7 +20,7 @@ class CfnStack: """Identify a CloudFormation stack.""" - def __init__(self, name, region, template=None): + def __init__(self, name, region, template): self.name = name self.region = region self.template = template diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index e53503eed1..cfe233728c 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -21,7 +21,7 @@ class Cluster: """Contain all static and dynamic data related to a cluster instance.""" - def __init__(self, name, ssh_key, config_file=None): + def __init__(self, name, ssh_key, config_file): self.name = name self.config_file = config_file self.ssh_key = ssh_key @@ -121,7 +121,7 @@ def create_cluster(self, cluster): # create the cluster logging.info("Creating cluster {0} with config {1}".format(name, config)) self.__created_clusters[name] = cluster - result = run_command(["pcluster", "create", "--no-rollback", "--config", config, name]) + result = run_command(["pcluster", "create", "--norollback", "--config", config, name]) if "Status: {0} - CREATE_COMPLETE".format(cluster.cfn_name) not in result.stdout: error = "Cluster creation failed for {0} with output: {1}".format(name, result.stdout) logging.error(error) diff --git 
a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index c2b8e6f66a..0589ca35ea 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -166,15 +166,15 @@ def clusters_factory(request): factory = ClustersFactory() def _cluster_factory(cluster_config): - if request.config.getoption("cluster"): - cluster = Cluster(name=request.config.getoption("cluster"), ssh_key=request.config.getoption("key_path")) - else: - cluster_config = _write_cluster_config_to_outdir(request, cluster_config) - cluster = Cluster( - name="integ-tests-" + random_alphanumeric(), - config_file=cluster_config, - ssh_key=request.config.getoption("key_path"), - ) + cluster_config = _write_cluster_config_to_outdir(request, cluster_config) + cluster = Cluster( + name=request.config.getoption("cluster") + if request.config.getoption("cluster") + else "integ-tests-" + random_alphanumeric(), + config_file=cluster_config, + ssh_key=request.config.getoption("key_path"), + ) + if not request.config.getoption("cluster"): factory.create_cluster(cluster) return cluster @@ -338,7 +338,8 @@ def vpc_stacks(cfn_stacks_factory, request): @retry(stop_max_attempt_number=2, wait_fixed=5000) def _create_vpc_stack(request, template, region, cfn_stacks_factory): if request.config.getoption("vpc_stack"): - stack = CfnStack(name=request.config.getoption("vpc_stack"), region=region) + logging.info("Using stack {0} in region {1}".format(request.config.getoption("vpc_stack"), region)) + stack = CfnStack(name=request.config.getoption("vpc_stack"), region=region, template=template.to_json()) else: stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) cfn_stacks_factory.create_stack(stack) diff --git a/tests/integration-tests/tests/test_efa/test_efa.py b/tests/integration-tests/tests/test_efa/test_efa.py index ece4aaa519..6fa0e071cb 100644 --- a/tests/integration-tests/tests/test_efa/test_efa.py +++ b/tests/integration-tests/tests/test_efa/test_efa.py @@ -10,6 +10,7 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging +import re import pytest @@ -20,7 +21,7 @@ INSTANCES_TO_SLOTS_MAP = {"c5n.18xlarge": 72, "p3dn.24xlarge": 96, "i3en.24xlarge": 96} -@pytest.mark.regions(["us-east-1", "eu-west-1"]) +@pytest.mark.regions(["us-east-1"]) @pytest.mark.instances(["c5n.18xlarge", "p3dn.24xlarge", "i3en.24xlarge"]) @pytest.mark.oss(["alinux", "centos7", "ubuntu1604"]) @pytest.mark.schedulers(["sge", "slurm"]) @@ -47,7 +48,7 @@ def _test_efa_installed(scheduler_commands, remote_command_executor): # Output contains: # 00:06.0 Ethernet controller: Amazon.com, Inc. Device efa0 logging.info("Testing EFA installed") - result = scheduler_commands.submit_command("/sbin/lspci > /shared/lspci.out") + result = scheduler_commands.submit_command("lspci > /shared/lspci.out") job_id = scheduler_commands.assert_job_submitted(result.stdout) scheduler_commands.wait_job_completed(job_id) @@ -58,7 +59,7 @@ def _test_efa_installed(scheduler_commands, remote_command_executor): assert_that(result.stdout).contains("00:06.0 Ethernet controller: Amazon.com, Inc. 
Device efa0") # Check EFA interface not present on master - result = remote_command_executor.run_remote_command("/sbin/lspci") + result = remote_command_executor.run_remote_command("lspci") assert_that(result.stdout).does_not_contain("00:06.0 Ethernet controller: Amazon.com, Inc. Device efa0") @@ -91,5 +92,6 @@ def _test_osu_benchmarks(remote_command_executor, scheduler_commands, test_datad scheduler_commands.wait_job_completed(job_id) scheduler_commands.assert_job_succeeded(job_id) - # TODO: perform assertions on benchmarks results - remote_command_executor.run_remote_command("cat /shared/osu.out") + output = remote_command_executor.run_remote_command("cat /shared/osu.out").stdout + latency = re.search(r"0\s+(\d\d)\.", output).group(1) + assert_that(int(latency)).is_less_than(20) diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh index 1a6174813c..cc912c9dc8 100644 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh @@ -6,5 +6,4 @@ wget http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.4 tar zxvf ./osu-micro-benchmarks-5.4.tar.gz cd osu-micro-benchmarks-5.4/ ./configure CC=/opt/amazon/efa/bin/mpicc CXX=/opt/amazon/efa/bin/mpicxx -make -# make install in the submit script +make \ No newline at end of file diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh index 480b6a34ba..1f48445e1e 100644 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh @@ -2,4 +2,4 @@ set -e module load openmpi -mpirun -N 1 -np 2 "mpi_hello_world" &> /shared/mpi.out +mpirun -N 1 -np 2 "mpi_hello_world" >> /shared/mpi.out diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh index 7dab24000c..39394ea744 100644 --- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh @@ -2,4 +2,4 @@ set -e module load openmpi -mpirun --map-by ppr:1:node /shared/osu-micro-benchmarks-5.4/mpi/pt2pt/osu_latency &> /shared/osu.out +mpirun --map-by ppr:1:node /shared/osu-micro-benchmarks-5.4/mpi/pt2pt/osu_latency >> /shared/osu.out From 8e166f0f8fce5d7fe2972f1954323b8144c3d50c Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Mon, 10 Jun 2019 19:58:13 +0200 Subject: [PATCH 118/121] Update CHANGELOG Signed-off-by: Francesco De Martino --- CHANGELOG.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 93483896f7..8d99858b53 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -31,14 +31,16 @@ CHANGELOG * Dynamically fetch compute instance type and cluster size in order to support updates in scaling daemons * Always use full master FQDN when mounting NFS on compute nodes. 
diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh
index 1a6174813c..cc912c9dc8 100644
--- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh
+++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh
@@ -6,5 +6,4 @@ wget http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.4
 tar zxvf ./osu-micro-benchmarks-5.4.tar.gz
 cd osu-micro-benchmarks-5.4/
 ./configure CC=/opt/amazon/efa/bin/mpicc CXX=/opt/amazon/efa/bin/mpicxx
-make
-# make install in the submit script
+make
\ No newline at end of file
diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh
index 480b6a34ba..1f48445e1e 100644
--- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh
+++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh
@@ -2,4 +2,4 @@
 set -e

 module load openmpi
-mpirun -N 1 -np 2 "mpi_hello_world" &> /shared/mpi.out
+mpirun -N 1 -np 2 "mpi_hello_world" >> /shared/mpi.out
diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh
index 7dab24000c..39394ea744 100644
--- a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh
+++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh
@@ -2,4 +2,4 @@
 set -e

 module load openmpi
-mpirun --map-by ppr:1:node /shared/osu-micro-benchmarks-5.4/mpi/pt2pt/osu_latency &> /shared/osu.out
+mpirun --map-by ppr:1:node /shared/osu-micro-benchmarks-5.4/mpi/pt2pt/osu_latency >> /shared/osu.out

From 8e166f0f8fce5d7fe2972f1954323b8144c3d50c Mon Sep 17 00:00:00 2001
From: Francesco De Martino
Date: Mon, 10 Jun 2019 19:58:13 +0200
Subject: [PATCH 118/121] Update CHANGELOG

Signed-off-by: Francesco De Martino
---
 CHANGELOG.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 93483896f7..8d99858b53 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -31,14 +31,16 @@ CHANGELOG
 * Dynamically fetch compute instance type and cluster size in order to support updates in scaling daemons
 * Always use full master FQDN when mounting NFS on compute nodes. This solves some issues occurring with some
   networking setups and custom DNS configurations
-* Set soft and hard ulimit on open files to 10000 for all supported OSs
 * List the version and status during ``pcluster list``
-* Add option to color the output of ``pcluster list``
 * Remove double quoting of the post_install args
 * ``awsbsub``: use override option to set the number of nodes rather than creating multiple JobDefinitions
+* Add support for AWS_PCLUSTER_CONFIG_FILE env variable to specify pcluster config file

 **CHANGES**

+* Update openmpi library to version 3.1.4 on Centos 7, Amazon Linux and Ubuntu 1604. This also changes the default
+  openmpi path to ``/opt/amazon/efa/bin/`` and the openmpi module name to ``openmpi/3.1.4``
+* Set soft and hard ulimit on open files to 10000 for all supported OSs
 * For a better security posture, we're removing AWS credentials from the ``parallelcluster`` config file
   Credentials can be now setup following the canonical procedure used for the aws cli
 * When using FSx or EFS do not enforce in sanity check that the compute security group is open to 0.0.0.0/0
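
Among the feature entries above, the AWS_PCLUSTER_CONFIG_FILE variable gives a third way to point the CLI at a config file. A sketch of the usual resolution order for such an override, explicit flag first, then environment, then default; this is illustrative rather than the actual pcluster code, and the ~/.parallelcluster/config default path is an assumption:

    import os

    def resolve_config_file(cli_arg=None):
        """Illustrative lookup order; not the actual pcluster implementation."""
        if cli_arg:  # an explicit --config argument wins
            return cli_arg
        env_path = os.environ.get("AWS_PCLUSTER_CONFIG_FILE")
        if env_path:  # then the new environment variable
            return env_path
        return os.path.expanduser("~/.parallelcluster/config")  # assumed default

This ordering lets CI jobs export the variable once per environment while still allowing a per-invocation flag to take precedence.
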
From fea9e47ad5e02084194958001682d0d65ef2ac5e Mon Sep 17 00:00:00 2001
From: Francesco De Martino
Date: Mon, 10 Jun 2019 20:14:30 +0200
Subject: [PATCH 119/121] integ tests: add skip_instances in scaling tests

Signed-off-by: Francesco De Martino
---
 tests/integration-tests/tests/test_scaling.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py
index 4cd09d62bf..3761ebcb3b 100644
--- a/tests/integration-tests/tests/test_scaling.py
+++ b/tests/integration-tests/tests/test_scaling.py
@@ -24,6 +24,7 @@


 @pytest.mark.skip_schedulers(["awsbatch"])
+@pytest.mark.skip_instances(["c5n.18xlarge", "p3dn.24xlarge", "i3en.24xlarge"])
 @pytest.mark.usefixtures("region", "os", "instance")
 def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir):
     scaledown_idletime = 4
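
The new skip_instances marker mirrors the existing skip_schedulers one: tests tagged this way are not run against the listed instance types (here the EFA-capable ones, which the dedicated EFA suite already exercises). One way such a marker can be honored, sketched with standard pytest hooks; this is not necessarily how this framework wires it up, and the --instance option name is assumed:

    import pytest

    def pytest_collection_modifyitems(config, items):
        """Skip tests whose skip_instances marker lists the current instance type."""
        instance = config.getoption("--instance", default=None)
        for item in items:
            marker = item.get_closest_marker("skip_instances")
            if marker and instance in marker.args[0]:
                item.add_marker(pytest.mark.skip(reason="skipped on instance " + instance))
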
From 8a23e6a220c657c448107ed9fe66811e06db5cd7 Mon Sep 17 00:00:00 2001
From: ParallelCluster AMI bot
Date: Tue, 11 Jun 2019 12:14:32 +0000
Subject: [PATCH 120/121] Update AMI List

Build Number 155
aws-parallelcluster-cookbook Git hash: 4afb7d672118aa192dd546b18ee4fcc4452e978d
aws-parallelcluster-node Git hash: 9db8e2ec25e2fc2314771cee92ebb29b78bdca0f

Signed-off-by: ParallelCluster AMI bot
---
 amis.txt                                    | 192 ++++++++++----------
 cloudformation/aws-parallelcluster.cfn.json | 192 ++++++++++----------
 2 files changed, 194 insertions(+), 190 deletions(-)

diff --git a/amis.txt b/amis.txt
index d9ebd68032..7dfd7a8dbb 100644
--- a/amis.txt
+++ b/amis.txt
@@ -1,100 +1,102 @@
 # alinux
-ap-northeast-1: ami-072ac3e57034c3bb9
-ap-northeast-2: ami-080aecd84cc7434f0
-ap-northeast-3: ami-0b0ebcca7a6d3cc89
-ap-south-1: ami-0357da922b040bdb4
-ap-southeast-1: ami-0fa2ec4e5f0402d8d
-ap-southeast-2: ami-00bcdbe79f9f9360f
-ca-central-1: ami-06f86504395727a59
-cn-north-1: ami-0c7128d1d555a95f3
-cn-northwest-1: ami-0b838ce1114165ca1
-eu-central-1: ami-00917e29a34aa32ee
-eu-north-1: ami-045b3be6347ece0fb
-eu-west-1: ami-0b05e2f00ebee148c
-eu-west-2: ami-0722446a78ea175f8
-eu-west-3: ami-0c54390b66173bd7d
-sa-east-1: ami-0a47ff8d07b75a4e9
-us-east-1: ami-0d8d467ddca09f9ea
-us-east-2: ami-0c8fd29db55b1fd99
-us-gov-east-1: ami-0646415808673ae07
-us-gov-west-1: ami-3b83f05a
-us-west-1: ami-06258a4b05a6eb611
-us-west-2: ami-020c1ec32ad429b44
+ap-northeast-1: ami-0dcc18768374b4441
+ap-northeast-2: ami-022e7c66ccb807c9f
+ap-northeast-3: ami-04402be7b85999df8
+ap-south-1: ami-0a14b1f0e7427a4bb
+ap-southeast-1: ami-02079735c20c1ac4e
+ap-southeast-2: ami-0c65952cdec26ae39
+ca-central-1: ami-01f28f8381746746f
+cn-north-1: ami-0da67c26ce2e8d111
+cn-northwest-1: ami-03dc8f759de9de690
+eu-central-1: ami-0ff6d2a86b9199e82
+eu-north-1: ami-0cb08caa10d113ed7
+eu-west-1: ami-0b5c32b12b9c340d0
+eu-west-2: ami-0c218c2aaa7185f03
+eu-west-3: ami-011e0eee21d52f23e
+sa-east-1: ami-0d154ae55458941fd
+us-east-1: ami-0d130bdfab2037f8a
+us-east-2: ami-00d2a10466c577ac7
+us-gov-east-1: ami-0f5003922daf22962
+us-gov-west-1: ami-ba83fbdb
+us-west-1: ami-0b6f7961ee845966e
+us-west-2: ami-0d611d90619419e93
 # centos6
-ap-northeast-1: ami-0f5ea785473ed7ab5
-ap-northeast-2: ami-07c9635710bc567e6
-ap-northeast-3: ami-0b65ed5f7a0ff726a
-ap-south-1: ami-0e18648a2d5b5e83c
-ap-southeast-1: ami-0f784e8d9290e0f50
-ap-southeast-2: ami-0e5559991f7845038
-ca-central-1: ami-0ba632dba5f588a43
-eu-central-1: ami-04771fcea0287def6
-eu-north-1: ami-0adf183b1beb95c23
-eu-west-1: ami-0c965c8bf7c698110
-eu-west-2: ami-032a7702e546c7e42
-eu-west-3: ami-0b7b602a10f83961c
-sa-east-1: ami-03309e4b0f7ef6ea5
-us-east-1: ami-0e2b588a37264f1cb
-us-east-2: ami-09461a00d94eb74e5
-us-west-1: ami-04d12cccd51b29965
-us-west-2: ami-0e2efd8528c9ddb07
+ap-northeast-1: ami-086781b933db101a5
+ap-northeast-2: ami-07d646c87d889d816
+ap-northeast-3: ami-082ece6e5fe8f6fd1
+ap-south-1: ami-02389426198baf430
+ap-southeast-1: ami-02105387481bd0ad0
+ap-southeast-2: ami-0050fad9761b3957c
+ca-central-1: ami-0e70755a47200df23
+eu-central-1: ami-03979ebb9cfee2ccc
+eu-north-1: ami-085a9ecbf9f64f65b
+eu-west-1: ami-070ba56e38a744df5
+eu-west-2: ami-08553013e6e986028
+eu-west-3: ami-0afff5bc147c847e0
+sa-east-1: ami-0635a9bdc378fe67f
+us-east-1: ami-091f37e900368fe1a
+us-east-2: ami-055404b3df678da86
+us-west-1: ami-0e438402399c457d7
+us-west-2: ami-0651b7e7cfde4b3a0
"ami-0eeb6c96d0e6c2d90", + "ubuntu1404": "ami-0481c6b023e2328b4", + "ubuntu1604": "ami-0179e2707f709f813" }, "ap-northeast-3": { - "alinux": "ami-0b0ebcca7a6d3cc89", - "centos6": "ami-0b65ed5f7a0ff726a", - "centos7": "ami-056bfe9c042992212", - "ubuntu1404": "ami-03aaa1e1ae3007767", - "ubuntu1604": "ami-088f0307cdd06a985" + "alinux": "ami-04402be7b85999df8", + "centos6": "ami-082ece6e5fe8f6fd1", + "centos7": "ami-084c0dbc04f722758", + "ubuntu1404": "ami-0a535e1d0bb7bc502", + "ubuntu1604": "ami-0c9b72bae5efc9f61" }, "ap-south-1": { - "alinux": "ami-0357da922b040bdb4", - "centos6": "ami-0e18648a2d5b5e83c", - "centos7": "ami-0cf72d39ece5fe7b8", - "ubuntu1404": "ami-04317bb153d6a8fff", - "ubuntu1604": "ami-066e4cfcc460304e0" + "alinux": "ami-0a14b1f0e7427a4bb", + "centos6": "ami-02389426198baf430", + "centos7": "ami-031f8f67a53de53fe", + "ubuntu1404": "ami-000e99acc047832ae", + "ubuntu1604": "ami-0f21d1eb3339ebd6a" }, "ap-southeast-1": { - "alinux": "ami-0fa2ec4e5f0402d8d", - "centos6": "ami-0f784e8d9290e0f50", - "centos7": "ami-08e0a6e0d5e22df29", - "ubuntu1404": "ami-000ce989867413e4e", - "ubuntu1604": "ami-0597821ac99a481e3" + "alinux": "ami-02079735c20c1ac4e", + "centos6": "ami-02105387481bd0ad0", + "centos7": "ami-041ca5c2f5b748966", + "ubuntu1404": "ami-09ca9a6a8fee71ba5", + "ubuntu1604": "ami-01899e9a659eb2267" }, "ap-southeast-2": { - "alinux": "ami-00bcdbe79f9f9360f", - "centos6": "ami-0e5559991f7845038", - "centos7": "ami-04b6949a410a967ed", - "ubuntu1404": "ami-09a0b2d2bdc669a6c", - "ubuntu1604": "ami-03c9d6bdf2a2cba07" + "alinux": "ami-0c65952cdec26ae39", + "centos6": "ami-0050fad9761b3957c", + "centos7": "ami-06c7f5584ecfcac3a", + "ubuntu1404": "ami-09646cc49a932a37e", + "ubuntu1604": "ami-049c81a79d55b2c8a" }, "ca-central-1": { - "alinux": "ami-06f86504395727a59", - "centos6": "ami-0ba632dba5f588a43", - "centos7": "ami-01cd6f223f4ca39af", - "ubuntu1404": "ami-0555c4a7c47384856", - "ubuntu1604": "ami-0e4208d9ab789da40" + "alinux": "ami-01f28f8381746746f", + "centos6": "ami-0e70755a47200df23", + "centos7": "ami-0afc2ea67b3963398", + "ubuntu1404": "ami-06ac5db73837bc364", + "ubuntu1604": "ami-0b8928a1f643684eb" }, "cn-north-1": { - "alinux": "ami-0c7128d1d555a95f3", - "ubuntu1404": "ami-0839700f2d6edb4e9", - "ubuntu1604": "ami-0e77f8ab0c91d2b9c" + "alinux": "ami-0da67c26ce2e8d111", + "ubuntu1404": "ami-07e16a5709c99f963", + "ubuntu1604": "ami-0ae967dc97d5eb57a" }, "cn-northwest-1": { - "alinux": "ami-0b838ce1114165ca1" + "alinux": "ami-03dc8f759de9de690", + "ubuntu1404": "ami-05348579489ba3673", + "ubuntu1604": "ami-0ba0b1ed49ce7b1b1" }, "eu-central-1": { - "alinux": "ami-00917e29a34aa32ee", - "centos6": "ami-04771fcea0287def6", - "centos7": "ami-00b98e76cefa6f7b8", - "ubuntu1404": "ami-0d35dcfebb277a64d", - "ubuntu1604": "ami-0457c581edab1213c" + "alinux": "ami-0ff6d2a86b9199e82", + "centos6": "ami-03979ebb9cfee2ccc", + "centos7": "ami-0205eaef48a9fc97a", + "ubuntu1404": "ami-0032889c720d364dc", + "ubuntu1604": "ami-002422c65a5bb1af8" }, "eu-north-1": { - "alinux": "ami-045b3be6347ece0fb", - "centos6": "ami-0adf183b1beb95c23", - "centos7": "ami-03bba2a04c7eee896", - "ubuntu1404": "ami-037dc7fd57f8c3151", - "ubuntu1604": "ami-03f64544dad6ae6be" + "alinux": "ami-0cb08caa10d113ed7", + "centos6": "ami-085a9ecbf9f64f65b", + "centos7": "ami-0420576e18a5fcb7c", + "ubuntu1404": "ami-0976908358f0bfa01", + "ubuntu1604": "ami-0d3c7ce730c73ab00" }, "eu-west-1": { - "alinux": "ami-0b05e2f00ebee148c", - "centos6": "ami-0c965c8bf7c698110", - "centos7": "ami-0d7719633ec6daed8", - "ubuntu1404": 
"ami-0c1b4a36f6da56db9", - "ubuntu1604": "ami-0aee65b23ac6121e5" + "alinux": "ami-0b5c32b12b9c340d0", + "centos6": "ami-070ba56e38a744df5", + "centos7": "ami-0f67868de5be7b0b3", + "ubuntu1404": "ami-0f5c65a609ad3afb4", + "ubuntu1604": "ami-00328873639859269" }, "eu-west-2": { - "alinux": "ami-0722446a78ea175f8", - "centos6": "ami-032a7702e546c7e42", - "centos7": "ami-05bcb384a4c8b0c5b", - "ubuntu1404": "ami-08b285fd75a7a483b", - "ubuntu1604": "ami-001486f35ffff87d9" + "alinux": "ami-0c218c2aaa7185f03", + "centos6": "ami-08553013e6e986028", + "centos7": "ami-057fa1a5314e3c414", + "ubuntu1404": "ami-08c2d96c2805037e7", + "ubuntu1604": "ami-0c1de72c6acf4b187" }, "eu-west-3": { - "alinux": "ami-0c54390b66173bd7d", - "centos6": "ami-0b7b602a10f83961c", - "centos7": "ami-054d53deecd76ad61", - "ubuntu1404": "ami-0653aa7d98f3033ec", - "ubuntu1604": "ami-05641d46059f19f8f" + "alinux": "ami-011e0eee21d52f23e", + "centos6": "ami-0afff5bc147c847e0", + "centos7": "ami-05b2808c2dc4fb82c", + "ubuntu1404": "ami-0f6cd6ac9be8f2b32", + "ubuntu1604": "ami-090d577bb6d08e95b" }, "sa-east-1": { - "alinux": "ami-0a47ff8d07b75a4e9", - "centos6": "ami-03309e4b0f7ef6ea5", - "centos7": "ami-089367ecc959c8bc4", - "ubuntu1404": "ami-0024bc42c6c9cc832", - "ubuntu1604": "ami-00451f8f5429960a6" + "alinux": "ami-0d154ae55458941fd", + "centos6": "ami-0635a9bdc378fe67f", + "centos7": "ami-0da1262e3c5d9af72", + "ubuntu1404": "ami-0d0da341da4802af9", + "ubuntu1604": "ami-08df8912b098a3f42" }, "us-east-1": { - "alinux": "ami-0d8d467ddca09f9ea", - "centos6": "ami-0e2b588a37264f1cb", - "centos7": "ami-0f942ac10a338af45", - "ubuntu1404": "ami-00144db1528d8942d", - "ubuntu1604": "ami-02b5b8dab90e057c4" + "alinux": "ami-0d130bdfab2037f8a", + "centos6": "ami-091f37e900368fe1a", + "centos7": "ami-031eb9c5390c0f8f6", + "ubuntu1404": "ami-017bfe181606779d8", + "ubuntu1604": "ami-08e1d33a6a64499de" }, "us-east-2": { - "alinux": "ami-0c8fd29db55b1fd99", - "centos6": "ami-09461a00d94eb74e5", - "centos7": "ami-0edbaef69d274e64e", - "ubuntu1404": "ami-0d7393fe329fac685", - "ubuntu1604": "ami-05f87726bd2802a79" + "alinux": "ami-00d2a10466c577ac7", + "centos6": "ami-055404b3df678da86", + "centos7": "ami-0050bd80a1cecfe37", + "ubuntu1404": "ami-043eb896e1bb2b948", + "ubuntu1604": "ami-0219fdb6f47395d88" }, "us-gov-east-1": { - "alinux": "ami-0646415808673ae07", - "ubuntu1404": "ami-06bcf13af6f8352b2", - "ubuntu1604": "ami-033192481089d3c64" + "alinux": "ami-0f5003922daf22962", + "ubuntu1404": "ami-060ced48ab370aadf", + "ubuntu1604": "ami-0af2c8e5bf3c334b0" }, "us-gov-west-1": { - "alinux": "ami-3b83f05a", - "ubuntu1404": "ami-a89deec9", - "ubuntu1604": "ami-f3ec9f92" + "alinux": "ami-ba83fbdb", + "ubuntu1404": "ami-32f98153", + "ubuntu1604": "ami-7b85fd1a" }, "us-west-1": { - "alinux": "ami-06258a4b05a6eb611", - "centos6": "ami-04d12cccd51b29965", - "centos7": "ami-036c33c35fa178b86", - "ubuntu1404": "ami-0e9ab67cbeffe41b6", - "ubuntu1604": "ami-03ff95bfd61b9cb5e" + "alinux": "ami-0b6f7961ee845966e", + "centos6": "ami-0e438402399c457d7", + "centos7": "ami-09bd008b253048b80", + "ubuntu1404": "ami-0d48f8a9d5735efde", + "ubuntu1604": "ami-066818f6a6be06fb5" }, "us-west-2": { - "alinux": "ami-020c1ec32ad429b44", - "centos6": "ami-0e2efd8528c9ddb07", - "centos7": "ami-0bbd399489108361e", - "ubuntu1404": "ami-056be711a3a572d20", - "ubuntu1604": "ami-0d6d6a4cbeb63a57d" + "alinux": "ami-0d611d90619419e93", + "centos6": "ami-0651b7e7cfde4b3a0", + "centos7": "ami-003da28849bc413f5", + "ubuntu1404": "ami-0169da6ccb6347f50", + "ubuntu1604": 
"ami-07122cb5a96b7fee9" } }, "OSFeatures": { From 4bf3cfba714484ac441268f4eb767488ac6c3f52 Mon Sep 17 00:00:00 2001 From: Luca Carrogu Date: Tue, 11 Jun 2019 15:08:28 +0200 Subject: [PATCH 121/121] Add new tag aws-parallelcluster-networking The new tag contains the EFA property Signed-off-by: Luca Carrogu --- cloudformation/aws-parallelcluster.cfn.json | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index e91b60d363..ce46f6efaa 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -2079,6 +2079,12 @@ ] } }, + { + "Key": "aws-parallelcluster-networking", + "Value": { + "Fn::Sub": "EFA=${EFA}" + } + }, { "Key": "aws-parallelcluster-filesystem", "Value": { @@ -2735,6 +2741,13 @@ }, "PropagateAtLaunch": true }, + { + "Key": "aws-parallelcluster-networking", + "Value": { + "Fn::Sub": "EFA=${EFA}" + }, + "PropagateAtLaunch": true + }, { "Key": "aws-parallelcluster-filesystem", "Value": {