diff --git a/.github/no-response.yml b/.github/no-response.yml
index f8f7ef35ca..82e40cc8a4 100644
--- a/.github/no-response.yml
+++ b/.github/no-response.yml
@@ -1,7 +1,7 @@
 # Configuration for probot-no-response - https://github.com/probot/no-response
 # Number of days of inactivity before an Issue is closed for lack of response
-daysUntilClose: 14
+daysUntilClose: 7
 # Label requiring a response
 responseRequiredLabel: closing-soon-if-no-response
 # Comment to post when closing an Issue for lack of response. Set to `false` to disable
diff --git a/.gitignore b/.gitignore
index ae57b2eea5..7eaf8ce5b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@ build/
 .coverage
 assets/
 report.html
+tests_outputs/
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 15f0fad6e7..8d99858b53 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -2,6 +2,72 @@
 CHANGELOG
 =========
 
+2.4.0
+=====
+
+**ENHANCEMENTS**
+
+* Add support for EFA on Centos 7, Amazon Linux and Ubuntu 1604
+* Add support for Ubuntu in China region ``cn-northwest-1``
+
+* SGE:
+
+  * process nodes added to or removed from the cluster in batches in order to speed up cluster scaling
+  * scale up only if required slots/nodes can be satisfied
+  * scale down if pending jobs have unsatisfiable CPU/node requirements
+  * add support for jobs in hold/suspended state (this includes job dependencies)
+  * automatically terminate and replace faulty or unresponsive compute nodes
+  * add retries in case of failures when adding or removing nodes
+  * configure the scheduler to handle rescheduling and cancellation of jobs running on failing or terminated nodes
+
+* Slurm:
+
+  * scale up only if required slots/nodes can be satisfied
+  * scale down if pending jobs have unsatisfiable CPU/node requirements
+  * automatically terminate and replace faulty or unresponsive compute nodes
+  * decrease SlurmdTimeout to 120 seconds to speed up replacement of faulty nodes
+
+* Automatically replace compute instances that fail initialization and dump their logs to the shared home directory
+* Dynamically fetch compute instance type and cluster size in order to support updates in scaling daemons
+* Always use the full master FQDN when mounting NFS on compute nodes. This solves some issues occurring with some networking
+  setups and custom DNS configurations
+* List the version and status of each cluster in ``pcluster list``
+* Remove double quoting of the post_install args
+* ``awsbsub``: use the override option to set the number of nodes rather than creating multiple JobDefinitions
+* Add support for the AWS_PCLUSTER_CONFIG_FILE environment variable to specify the pcluster config file
+
+**CHANGES**
+
+* Update the openmpi library to version 3.1.4 on Centos 7, Amazon Linux and Ubuntu 1604. This also changes the default
+  openmpi path to ``/opt/amazon/efa/bin/`` and the openmpi module name to ``openmpi/3.1.4``
+* Set soft and hard ulimit on open files to 10000 for all supported OSs
+* For a better security posture, we're removing AWS credentials from the ``parallelcluster`` config file.
+  Credentials can now be set up following the canonical procedure used for the AWS CLI
+* When using FSx or EFS, do not enforce in the sanity check that the compute security group is open to 0.0.0.0/0
+* When updating an existing cluster, the same template version is now used, no matter the pcluster CLI version
+* SQS messages that fail to be processed in ``sqswatcher`` are now re-queued only 3 times and not forever
+* Reset ``nodewatcher`` idletime to 0 when the host becomes essential for the cluster (because of the min size of the ASG or
+  because there are pending jobs in the scheduler queue)
+* SGE: a node is considered busy when in one of the following states: "u", "C", "s", "d", "D", "E", "P", "o".
+  This allows a quick replacement of the node without waiting for the ``nodewatcher`` to terminate it.
+* Do not update the DynamoDB table on cluster updates in order to avoid hitting strict API limits (1 update per day).
+
+**BUG FIXES**
+
+* Fix issue that was preventing Torque from being used on Centos 7
+* Start node daemons at the end of instance initialization. The time spent on the post-install script and node
+  initialization is no longer counted as part of node idletime.
+* Fix issue that was causing an additional and invalid EBS mount point to be added in case of multiple EBS volumes
+* Install Slurm libpmpi/libpmpi2, which is distributed in a separate package since Slurm 17
+* ``pcluster ssh`` command now works for clusters with ``use_public_ips = false``
+* Slurm: add "BeginTime", "NodeDown", "Priority" and "ReqNodeNotAvail" to the pending reasons that trigger
+  cluster scaling
+* Add a timeout on remote command execution so that the daemons are not stuck if the compute node is unresponsive
+* Fix an edge case that was causing the ``nodewatcher`` to hang forever in case the node had become essential to the
+  cluster during a call to ``self_terminate``.
+* Fix ``pcluster start/stop`` commands when used with an ``awsbatch`` cluster
+
+
 2.3.1
 =====
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6b15a46776..60963ea9c8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -56,6 +56,6 @@ If you discover a potential security issue in this project we ask that you notif
 
 ## Licensing
 
-See the [LICENSE](https://github.com/aws/aws-parallelcluster/blob/develop/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
+See the [LICENSE](https://github.com/aws/aws-parallelcluster/blob/develop/LICENSE.txt) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
 
 We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
diff --git a/README.rst b/README.rst
index b058b60ee5..0020474fb1 100644
--- a/README.rst
+++ b/README.rst
@@ -41,8 +41,6 @@ Then, run pcluster configure:
 
   $ pcluster configure
   Cluster Template [default]:
-  AWS Access Key ID []:
-  AWS Secret Access Key ID []:
   Acceptable Values for AWS Region ID:
       ap-south-1
       ...
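With the access-key prompts removed from ``pcluster configure`` and the keys gone from the ``[aws]`` section, credentials are
resolved through the standard boto3/AWS CLI credential chain. A minimal sketch of what that means for any boto3 call made by
the CLI follows; the region value is only a placeholder and the snippet is illustrative, not part of the patch::

    import boto3

    # No aws_access_key_id/aws_secret_access_key are read from ~/.parallelcluster/config anymore;
    # boto3 falls back to environment variables, ~/.aws/credentials (written by `aws configure`),
    # or an instance/role profile.
    ec2 = boto3.client("ec2", region_name="us-east-1")  # placeholder region

    # Any API call now authenticates with whichever credentials the chain resolves.
    print([key["KeyName"] for key in ec2.describe_key_pairs()["KeyPairs"]])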
@@ -105,23 +103,8 @@ HPC forum which may be helpful:https://forums.aws.amazon.com/forum.jspa?forumID= Changes ------- -CfnCluster 1.6 IAM Change -========================= -Between CfnCluster 1.5.4 and 1.6.0 we made a change to the CfnClusterInstancePolicy that adds “s3:GetObject” permissions -on objects in -cfncluster bucket, "autoscaling:SetDesiredCapacity", "autoscaling:DescribeTags" permissions and -"cloudformation:DescribeStacks" permissions on ::stack/cfncluster-*. +CfnCluster to AWS ParallelCluster +================================= +In Version `2.0.0`, we changed the name of CfnCluster to AWS ParallelCluster. With that name change we released several new features, which you can read about in the `Change Log`_. -If you’re using a custom policy (e.g. you specify "ec2_iam_role" in your config) be sure it includes this new permission. See https://aws-parallelcluster.readthedocs.io/en/latest/iam.html - -CfnCluster 1.5 IAM Change -========================= -Between CfnCluster 1.4.2 and 1.5.0 we made a change to the CfnClusterInstancePolicy that adds “ec2:DescribeVolumes” permissions. If you’re using a custom policy (e.g. you specify "ec2_iam_role" in your config) be sure it includes this new permission. See https://aws-parallelcluster.readthedocs.io/en/latest/iam.html - -CfnCluster 1.2 and Earlier -========================== - -For various security (on our side) and maintenance reasons, CfnCluster -1.2 and earlier have been deprecated. AWS-side resources necessary to -create a cluster with CfnCluster 1.2 or earlier are no longer -available. Existing clusters will continue to operate, but new -clusters can not be created. +.. _`Change Log`: https://github.com/aws/aws-parallelcluster/blob/develop/CHANGELOG.rst#200 diff --git a/amis.txt b/amis.txt index a4c0fd5dd6..7dfd7a8dbb 100644 --- a/amis.txt +++ b/amis.txt @@ -1,100 +1,102 @@ # alinux -ap-northeast-1: ami-0af8c1a29f58c3b91 -ap-northeast-2: ami-036c289fda8701f9d -ap-northeast-3: ami-000902aa3082732ce -ap-south-1: ami-00ff6216daa4b0a69 -ap-southeast-1: ami-03b015a13daa9ff8d -ap-southeast-2: ami-0c2528255cc7c4cec -ca-central-1: ami-05bad5df22b9502e5 -cn-north-1: ami-0227bedfc6798cba1 -cn-northwest-1: ami-08143603c5f390f20 -eu-central-1: ami-003262ea853b26050 -eu-north-1: ami-06cac8aed0729f14c -eu-west-1: ami-0691d6d6d4d209e09 -eu-west-2: ami-0d241a5c57ee3421d -eu-west-3: ami-0e59dd1d2794a857c -sa-east-1: ami-07b044055a13cf93e -us-east-1: ami-0f8b01b1377483305 -us-east-2: ami-049afa5b53a7880d8 -us-gov-east-1: ami-02ee5c66a10526bd1 -us-gov-west-1: ami-7da7d01c -us-west-1: ami-02c87842ea944292e -us-west-2: ami-09b457d5cba24514a +ap-northeast-1: ami-0dcc18768374b4441 +ap-northeast-2: ami-022e7c66ccb807c9f +ap-northeast-3: ami-04402be7b85999df8 +ap-south-1: ami-0a14b1f0e7427a4bb +ap-southeast-1: ami-02079735c20c1ac4e +ap-southeast-2: ami-0c65952cdec26ae39 +ca-central-1: ami-01f28f8381746746f +cn-north-1: ami-0da67c26ce2e8d111 +cn-northwest-1: ami-03dc8f759de9de690 +eu-central-1: ami-0ff6d2a86b9199e82 +eu-north-1: ami-0cb08caa10d113ed7 +eu-west-1: ami-0b5c32b12b9c340d0 +eu-west-2: ami-0c218c2aaa7185f03 +eu-west-3: ami-011e0eee21d52f23e +sa-east-1: ami-0d154ae55458941fd +us-east-1: ami-0d130bdfab2037f8a +us-east-2: ami-00d2a10466c577ac7 +us-gov-east-1: ami-0f5003922daf22962 +us-gov-west-1: ami-ba83fbdb +us-west-1: ami-0b6f7961ee845966e +us-west-2: ami-0d611d90619419e93 # centos6 -ap-northeast-1: ami-0476984f547d1f4f2 -ap-northeast-2: ami-06ecb1e81881cd450 -ap-northeast-3: ami-04d195b55ddf56228 -ap-south-1: ami-0b1abd2bf8810487c -ap-southeast-1: 
ami-0576b4b2db8272abf -ap-southeast-2: ami-09a18baab0a142123 -ca-central-1: ami-0aa03a3f1b737c651 -eu-central-1: ami-092bd9c46746d940b -eu-north-1: ami-07b83433077d8345b -eu-west-1: ami-09880c7e25df69af8 -eu-west-2: ami-0eba961d9f30431b2 -eu-west-3: ami-0d0b243ac76765544 -sa-east-1: ami-0dfdc6ab8bf7935ea -us-east-1: ami-00f71e3be938f3077 -us-east-2: ami-0b29637d31cf774aa -us-west-1: ami-08dc392067bcf9807 -us-west-2: ami-0fa309858f6ce66ee +ap-northeast-1: ami-086781b933db101a5 +ap-northeast-2: ami-07d646c87d889d816 +ap-northeast-3: ami-082ece6e5fe8f6fd1 +ap-south-1: ami-02389426198baf430 +ap-southeast-1: ami-02105387481bd0ad0 +ap-southeast-2: ami-0050fad9761b3957c +ca-central-1: ami-0e70755a47200df23 +eu-central-1: ami-03979ebb9cfee2ccc +eu-north-1: ami-085a9ecbf9f64f65b +eu-west-1: ami-070ba56e38a744df5 +eu-west-2: ami-08553013e6e986028 +eu-west-3: ami-0afff5bc147c847e0 +sa-east-1: ami-0635a9bdc378fe67f +us-east-1: ami-091f37e900368fe1a +us-east-2: ami-055404b3df678da86 +us-west-1: ami-0e438402399c457d7 +us-west-2: ami-0651b7e7cfde4b3a0 # centos7 -ap-northeast-1: ami-0f13f45e966236e46 -ap-northeast-2: ami-016c726d8902d133c -ap-northeast-3: ami-037c3a13cd142c8f8 -ap-south-1: ami-06b7212503b9d9637 -ap-southeast-1: ami-0c39937e9ae643ecd -ap-southeast-2: ami-0164dbfb6b7b938f5 -ca-central-1: ami-0ee7cb4d2673e78de -eu-central-1: ami-0bcced571d9cc0142 -eu-north-1: ami-00255a59ce6bd8147 -eu-west-1: ami-00c07933e0ea22f7d -eu-west-2: ami-09aa34259643c50eb -eu-west-3: ami-04ce6f74e1070a795 -sa-east-1: ami-0a625e9dcf563db57 -us-east-1: ami-0658a809b3e89b0c9 -us-east-2: ami-07cef254f8886ea4e -us-west-1: ami-0454b933360a077e4 -us-west-2: ami-03b7e311ae2f4aacb +ap-northeast-1: ami-09bae677f8f58842d +ap-northeast-2: ami-0eeb6c96d0e6c2d90 +ap-northeast-3: ami-084c0dbc04f722758 +ap-south-1: ami-031f8f67a53de53fe +ap-southeast-1: ami-041ca5c2f5b748966 +ap-southeast-2: ami-06c7f5584ecfcac3a +ca-central-1: ami-0afc2ea67b3963398 +eu-central-1: ami-0205eaef48a9fc97a +eu-north-1: ami-0420576e18a5fcb7c +eu-west-1: ami-0f67868de5be7b0b3 +eu-west-2: ami-057fa1a5314e3c414 +eu-west-3: ami-05b2808c2dc4fb82c +sa-east-1: ami-0da1262e3c5d9af72 +us-east-1: ami-031eb9c5390c0f8f6 +us-east-2: ami-0050bd80a1cecfe37 +us-west-1: ami-09bd008b253048b80 +us-west-2: ami-003da28849bc413f5 # ubuntu1404 -ap-northeast-1: ami-0ce1c5516c087ef8d -ap-northeast-2: ami-0744c53e9582abcd4 -ap-northeast-3: ami-0d0faa548bcca5fac -ap-south-1: ami-00721e9f7f8235dba -ap-southeast-1: ami-03df9d0a89a448c63 -ap-southeast-2: ami-06116e2159f6ba6bf -ca-central-1: ami-0d180013cf3d07fc9 -cn-north-1: ami-0ef85bbc4ba66c301 -eu-central-1: ami-04b116ae9a44c861f -eu-north-1: ami-0de1c666987bbdb1f -eu-west-1: ami-01b114f6a268d6a42 -eu-west-2: ami-0f9ad3c001b80325a -eu-west-3: ami-0f921986737ab8306 -sa-east-1: ami-0d1d30ad051235185 -us-east-1: ami-0422aa8ec2e452870 -us-east-2: ami-02447e477105886bd -us-gov-east-1: ami-03538e53996b83762 -us-gov-west-1: ami-90a2d5f1 -us-west-1: ami-0f4a99f972b9b4882 -us-west-2: ami-04caeb57df33aba89 +ap-northeast-1: ami-0939e3e1030d4f7d2 +ap-northeast-2: ami-0481c6b023e2328b4 +ap-northeast-3: ami-0a535e1d0bb7bc502 +ap-south-1: ami-000e99acc047832ae +ap-southeast-1: ami-09ca9a6a8fee71ba5 +ap-southeast-2: ami-09646cc49a932a37e +ca-central-1: ami-06ac5db73837bc364 +cn-north-1: ami-07e16a5709c99f963 +cn-northwest-1: ami-05348579489ba3673 +eu-central-1: ami-0032889c720d364dc +eu-north-1: ami-0976908358f0bfa01 +eu-west-1: ami-0f5c65a609ad3afb4 +eu-west-2: ami-08c2d96c2805037e7 +eu-west-3: ami-0f6cd6ac9be8f2b32 +sa-east-1: 
ami-0d0da341da4802af9 +us-east-1: ami-017bfe181606779d8 +us-east-2: ami-043eb896e1bb2b948 +us-gov-east-1: ami-060ced48ab370aadf +us-gov-west-1: ami-32f98153 +us-west-1: ami-0d48f8a9d5735efde +us-west-2: ami-0169da6ccb6347f50 # ubuntu1604 -ap-northeast-1: ami-041f6050eff86f024 -ap-northeast-2: ami-0df4c1dafbfee5031 -ap-northeast-3: ami-08d3ef362e1d06e56 -ap-south-1: ami-0ef148f6ae69767d7 -ap-southeast-1: ami-0b63a13236ce5b8d9 -ap-southeast-2: ami-0f5a3072f23556b07 -ca-central-1: ami-0c88262f6fd2738fc -cn-north-1: ami-017ea2a40c48f9af4 -eu-central-1: ami-06a21b6e0815065a4 -eu-north-1: ami-0418320f06192d788 -eu-west-1: ami-0809bc00666e41cfa -eu-west-2: ami-04d8578267aaa2ac4 -eu-west-3: ami-02de781189ccb9f92 -sa-east-1: ami-088d6a838e8dc6b11 -us-east-1: ami-0a8c4ea1bd1ff7651 -us-east-2: ami-04d5c390495e0509f -us-gov-east-1: ami-0bfb76fbbbb68030d -us-gov-west-1: ami-eeaed98f -us-west-1: ami-0a33d79d5f920cc2c -us-west-2: ami-00050b3048393bc12 +ap-northeast-1: ami-06b328a6ee03ccdf4 +ap-northeast-2: ami-0179e2707f709f813 +ap-northeast-3: ami-0c9b72bae5efc9f61 +ap-south-1: ami-0f21d1eb3339ebd6a +ap-southeast-1: ami-01899e9a659eb2267 +ap-southeast-2: ami-049c81a79d55b2c8a +ca-central-1: ami-0b8928a1f643684eb +cn-north-1: ami-0ae967dc97d5eb57a +cn-northwest-1: ami-0ba0b1ed49ce7b1b1 +eu-central-1: ami-002422c65a5bb1af8 +eu-north-1: ami-0d3c7ce730c73ab00 +eu-west-1: ami-00328873639859269 +eu-west-2: ami-0c1de72c6acf4b187 +eu-west-3: ami-090d577bb6d08e95b +sa-east-1: ami-08df8912b098a3f42 +us-east-1: ami-08e1d33a6a64499de +us-east-2: ami-0219fdb6f47395d88 +us-gov-east-1: ami-0af2c8e5bf3c334b0 +us-gov-west-1: ami-7b85fd1a +us-west-1: ami-066818f6a6be06fb5 +us-west-2: ami-07122cb5a96b7fee9 diff --git a/cli/awsbatch/awsbsub.py b/cli/awsbatch/awsbsub.py index 57791153cb..1a13c81d91 100644 --- a/cli/awsbatch/awsbsub.py +++ b/cli/awsbatch/awsbsub.py @@ -24,7 +24,7 @@ import argparse from awsbatch.common import AWSBatchCliConfig, Boto3ClientFactory, config_logger -from awsbatch.utils import S3Uploader, fail, get_job_definition_name_by_arn, shell_join +from awsbatch.utils import S3Uploader, fail, shell_join def _get_parser(): @@ -480,16 +480,13 @@ def run( # noqa: C901 FIXME } if nodes: - # Multi Node parallel submission - job_definition_version = self.__get_mnp_job_definition_version( - base_job_definition_arn=job_definition, nodes=nodes - ) - submission_args.update({"jobDefinition": job_definition_version}) + submission_args.update({"jobDefinition": job_definition}) - target_nodes = "0:%d" % (nodes - 1) + target_nodes = "0:" # populate node overrides node_overrides = { - "nodePropertyOverrides": [{"targetNodes": target_nodes, "containerOverrides": container_overrides}] + "numNodes": nodes, + "nodePropertyOverrides": [{"targetNodes": target_nodes, "containerOverrides": container_overrides}], } submission_args.update({"nodeOverrides": node_overrides}) if timeout: @@ -508,89 +505,6 @@ def run( # noqa: C901 FIXME except Exception as e: fail("Error submitting job to AWS Batch. Failed with exception: %s" % e) - def __get_mnp_job_definition_version(self, base_job_definition_arn, nodes): - """ - Get (and create if required) job definition version to use for the submission. 
- - :return: job definition arn - """ - # Check if there is already a job definition for the given number of nodes - job_definition_found = self.__search_for_job_definition(base_job_definition_arn, nodes) - if job_definition_found: - job_definition_arn = job_definition_found["jobDefinitionArn"] - self.log.info("Found existing Job definition (%s) with (%i) nodes" % (job_definition_arn, nodes)) - else: - self.log.info("Creating new Job definition with (%i) nodes" % nodes) - # create a new job definition revision - job_definition_arn = self.__register_new_job_definition(base_job_definition_arn, nodes) - - self.log.info("Job definition to use is (%s)" % job_definition_arn) - return job_definition_arn - - def __search_for_job_definition(self, base_job_definition, nodes): - """ - Search for existing job definition with the same name of the base_job_definition and the same number of nodes. - - :param base_job_definition: job definition arn - :param nodes: number of nodes - :return: the found jobDefinition object or None - """ - job_definition_found = None - base_job_definition_name = get_job_definition_name_by_arn(base_job_definition) - try: - next_token = "" - while next_token is not None: - response = self.batch_client.describe_job_definitions( - jobDefinitionName=base_job_definition_name, status="ACTIVE", nextToken=next_token - ) - for job_definition in response["jobDefinitions"]: - if job_definition["nodeProperties"]["numNodes"] == nodes: - job_definition_found = job_definition - break - next_token = response.get("nextToken") - except Exception as e: - fail("Error listing job definition. Failed with exception: %s" % e) - - return job_definition_found - - def __register_new_job_definition(self, base_job_definition_arn, nodes): - """ - Register a new job definition. - - It uses the base_job_definition_arn as starting point for the nodeRangeProperties. - - :param base_job_definition_arn: job definition arn to use as starting point - :param nodes: nuber of nodes to set in the job definition - :return: the ARN of the created job definition - """ - try: - # get base job definition and reuse its nodeRangeProperties - response = self.batch_client.describe_job_definitions( - jobDefinitions=[base_job_definition_arn], status="ACTIVE" - ) - job_definition = response["jobDefinitions"][0] - - # create new job definition - response = self.batch_client.register_job_definition( - jobDefinitionName=job_definition["jobDefinitionName"], - type="multinode", - nodeProperties={ - "numNodes": nodes, - "mainNode": 0, - "nodeRangeProperties": [ - { - "targetNodes": "0:%d" % (nodes - 1), - "container": job_definition["nodeProperties"]["nodeRangeProperties"][0]["container"], - } - ], - }, - ) - job_definition_arn = response["jobDefinitionArn"] - except Exception as e: - fail("Error listing job definition. 
Failed with exception: %s" % e) - - return job_definition_arn - def main(): """Command entrypoint.""" diff --git a/cli/pcluster/cfnconfig.py b/cli/pcluster/cfnconfig.py index 3cfc145787..489cb4d3d2 100644 --- a/cli/pcluster/cfnconfig.py +++ b/cli/pcluster/cfnconfig.py @@ -30,7 +30,7 @@ from botocore.exceptions import ClientError from pcluster.config_sanity import ResourceValidator -from pcluster.utils import get_instance_vcpus +from pcluster.utils import get_instance_vcpus, get_supported_features class ParallelClusterConfig(object): @@ -104,6 +104,9 @@ def __init__(self, args): # Initialize aliases public attributes self.__init_aliases() + # efa checks + self.__init_efa_parameters() + # Handle extra parameters supplied on command-line try: if self.args.extra_parameters is not None: @@ -128,10 +131,13 @@ def __init_config(self): :return: configuration object """ - # Determine config file name based on args or default + # Determine config file name based on args, env or default if hasattr(self.args, "config_file") and self.args.config_file is not None: config_file = self.args.config_file default_config = False + elif "AWS_PCLUSTER_CONFIG_FILE" in os.environ: + config_file = os.environ["AWS_PCLUSTER_CONFIG_FILE"] + default_config = False else: config_file = os.path.expanduser(os.path.join("~", ".parallelcluster", "config")) default_config = True @@ -354,6 +360,12 @@ def __init_vpc_parameters(self): "VPC section [%s] used in [%s] section is not defined" % (vpc_section, self.__cluster_section) ) + # Check that cidr and public ips are not both set + cidr_value = self.__config.get(vpc_section, "compute_subnet_cidr", fallback=None) + public_ips = self.__config.getboolean(vpc_section, "use_public_ips", fallback=True) + if self.__sanity_check: + ResourceValidator.validate_vpc_coherence(cidr_value, public_ips) + def __check_account_capacity(self): """Try to launch the requested number of instances to verify Account limits.""" if self.parameters.get("Scheduler") == "awsbatch" or self.parameters.get("ClusterType", "ondemand") == "spot": @@ -460,10 +472,13 @@ def __init_size_parameters(self): self.parameters["MaxSize"] = "10" size_parameters = OrderedDict( - initial_queue_size=("InitialQueueSize", None), - maintain_initial_size=("MaintainInitialSize", None), - max_queue_size=("MaxQueueSize", None), + [ + ("initial_queue_size", ("InitialQueueSize", None)), + ("maintain_initial_size", ("MaintainInitialSize", None)), + ("max_queue_size", ("MaxQueueSize", None)), + ] ) + for key in size_parameters: try: __temp__ = self.__config.get(self.__cluster_section, key) @@ -521,6 +536,23 @@ def __init_cluster_parameters(self): except configparser.NoOptionError: pass + def __init_efa_parameters(self): + try: + __temp__ = self.__config.get(self.__cluster_section, "enable_efa") + if __temp__ != "compute": + self.__fail("valid values for enable_efa = compute") + + supported_features = get_supported_features(self.region, "efa") + valid_instances = supported_features.get("instances") + + self.__validate_instance("EFA", self.parameters.get("ComputeInstanceType"), valid_instances) + self.__validate_os("EFA", self.__get_os(), ["alinux", "centos7", "ubuntu1604"]) + self.__validate_scheduler("EFA", self.__get_scheduler(), ["sge", "slurm", "torque"]) + self.__validate_resource("EFA", self.parameters) + self.parameters["EFA"] = __temp__ + except configparser.NoOptionError: + pass + def __init_extra_json_parameter(self): """Check for extra_json = { "cluster" : ... 
} configuration parameters and map to "cfncluster".""" extra_json = self.parameters.get("ExtraJson") @@ -587,12 +619,26 @@ def __check_option_absent_awsbatch(self, option): if self.__config.has_option(self.__cluster_section, option): self.__fail("option %s cannot be used with awsbatch" % option) + def __get_scheduler(self): + scheduler = "sge" + if self.__config.has_option(self.__cluster_section, "scheduler"): + scheduler = self.__config.get(self.__cluster_section, "scheduler") + return scheduler + def __get_os(self): base_os = "alinux" if self.__config.has_option(self.__cluster_section, "base_os"): base_os = self.__config.get(self.__cluster_section, "base_os") return base_os + def __validate_instance(self, service, instance, valid_instances): + if instance not in valid_instances: + self.__fail("%s can only be used with the following instances: %s" % (service, valid_instances)) + + def __validate_scheduler(self, service, scheduler, supported_schedulers): + if scheduler not in supported_schedulers: + self.__fail("%s supports following Schedulers: %s" % (service, supported_schedulers)) + def __validate_os(self, service, baseos, supported_oses): if baseos not in supported_oses: self.__fail("%s supports following OSes: %s" % (service, supported_oses)) diff --git a/cli/pcluster/cli.py b/cli/pcluster/cli.py index dd0f18aa5f..dd2727b913 100644 --- a/cli/pcluster/cli.py +++ b/cli/pcluster/cli.py @@ -22,6 +22,8 @@ from pcluster import easyconfig, pcluster +LOGGER = logging.getLogger("pcluster.pcluster") + def create(args): pcluster.create(args) @@ -56,7 +58,8 @@ def update(args): def version(args): - pcluster.version(args) + version = pcluster.version() + LOGGER.info(version) def start(args): @@ -154,7 +157,7 @@ def _get_parser(): "--template-url", help="Specifies the URL for a custom CloudFormation template, " "if it was used at creation time.", ) - pcreate.add_argument("-t", "--cluster-template", help="Indicates which cluster template to use.") + pcreate.add_argument("-t", "--cluster-template", help="Indicates which section of the cluster template to use.") pcreate.add_argument("-p", "--extra-parameters", type=json.loads, help="Adds extra parameters to the stack create.") pcreate.add_argument("-g", "--tags", type=json.loads, help="Specifies additional tags to be added to the stack.") pcreate.set_defaults(func=create) @@ -162,7 +165,7 @@ def _get_parser(): # update command subparser pupdate = subparsers.add_parser( "update", - help="Updates a running cluster using the values in the config " "file or in a TEMPLATE_URL provided.", + help="Updates a running cluster using the values in the config file.", epilog="When the command is called and it begins polling for the status of that call, " 'it is safe to "Ctrl-C" out. 
You can always return to that status by ' 'calling "pcluster status mycluster".', @@ -178,8 +181,7 @@ def _get_parser(): default=False, help="Disable CloudFormation stack rollback on error.", ) - pupdate.add_argument("-u", "--template-url", help="Specifies the URL for a custom CloudFormation template.") - pupdate.add_argument("-t", "--cluster-template", help="Indicates which cluster template to use.") + pupdate.add_argument("-t", "--cluster-template", help="Indicates which section of the cluster template to use.") pupdate.add_argument("-p", "--extra-parameters", help="Adds extra parameters to the stack update.") pupdate.add_argument( "-rd", @@ -245,6 +247,7 @@ def _get_parser(): help="Displays a list of stacks associated with AWS ParallelCluster.", epilog="This command lists the names of any CloudFormation stacks named parallelcluster-*", ) + plist.add_argument("--color", action="store_true", default=False, help="Display the cluster status in color.") _addarg_config(plist) _addarg_region(plist) plist.set_defaults(func=list_stacks) diff --git a/cli/pcluster/config_sanity.py b/cli/pcluster/config_sanity.py index 4c3eeec998..f58571a42f 100644 --- a/cli/pcluster/config_sanity.py +++ b/cli/pcluster/config_sanity.py @@ -23,7 +23,7 @@ import boto3 from botocore.exceptions import ClientError -from pcluster.utils import get_instance_vcpus, get_supported_batch_instances +from pcluster.utils import get_instance_vcpus, get_supported_features class ResourceValidator(object): @@ -46,29 +46,39 @@ def __get_partition(self): return "aws-us-gov" return "aws" + @staticmethod + def validate_vpc_coherence(cidr_value, public_ip): + """ + Check that cidr_value and public_ip parameters are not conflicting. + + :param cidr_value: the value of compute_subnet_cidr set by the user (default should be None) + :param public_ip: the value of use_public_ips set by the user (default should be True) + """ + if cidr_value and public_ip is False: + ResourceValidator.__fail("VPC COHERENCE", "compute_subnet_cidr needs use_public_ips to be true") + @staticmethod def __check_sg_rules_for_port(rule, port_to_check): """ - Verify if the security group rule accepts connections to the given port. + Verify if the security group rule accepts connections on the given port. :param rule: The rule to check :param port_to_check: The port to check :return: True if the rule accepts connection, False otherwise """ - port = rule.get("FromPort") - ip_rules = rule.get("IpRanges") - group = rule.get("UserIdGroupPairs") + from_port = rule.get("FromPort") + to_port = rule.get("ToPort") + ip_protocol = rule.get("IpProtocol") - is_valid = False - for ip_rule in ip_rules: - ip = ip_rule.get("CidrIp") - # An existing rule is valid for EFS if, it allows all traffic(0.0.0.0/0) - # from all ports or the given port, and does not have a security group restriction - if (not port or port == port_to_check) and ip == "0.0.0.0/0" and not group: - is_valid = True - break + # if ip_protocol is -1, all ports are allowed + if ip_protocol == "-1": + return True + # tcp == protocol 6, + # if the ip_protocol is tcp, from_port and to_port must >= 0 and <= 65535 + if (ip_protocol in ["tcp", "6"]) and (from_port <= port_to_check <= to_port): + return True - return is_valid + return False def __check_efs_fs_id(self, ec2, efs, resource_value): # noqa: C901 FIXME!!! try: @@ -112,8 +122,8 @@ def __check_efs_fs_id(self, ec2, efs, resource_value): # noqa: C901 FIXME!!! 
self.__fail( "EFSFSId", "There is an existing Mount Target %s in the Availability Zone %s for EFS %s, " - "and it does not have a security group with inbound and outbound rules that support NFS. " - "Please modify the Mount Target's security group, or delete the Mount Target." + "but it does not have a security group that allows inbound and outbound rules to support NFS. " + "Please modify the Mount Target's security group, to allow traffic on port 2049." % (mt_id, availability_zone, resource_value[0]), ) except ClientError as e: @@ -171,7 +181,7 @@ def __check_fsx_fs_id(self, ec2, fsx, resource_value): "FSXFSId", "The current security group settings on file system %s does not satisfy " "mounting requirement. The file system must be associated to a security group that allows " - "inbound and outbound TCP traffic from 0.0.0.0/0 through port 988." % resource_value[0], + "inbound and outbound TCP traffic through port 988." % resource_value[0], ) return True except ClientError as e: @@ -212,6 +222,53 @@ def __validate_fsx_parameters(self, resource_type, resource_value): if not (1 <= int(resource_value[0]) <= 512000): self.__fail(resource_type, "has a minimum size of 1 MiB, and max size of 512,000 MiB") + def __validate_efa_sg(self, resource_type, sg_id): + try: + ec2 = boto3.client( + "ec2", + region_name=self.region, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + ) + sg = ec2.describe_security_groups(GroupIds=[sg_id]).get("SecurityGroups")[0] + in_rules = sg.get("IpPermissions") + out_rules = sg.get("IpPermissionsEgress") + + allowed_in = False + allowed_out = False + for rule in in_rules: + # UserIdGroupPairs is always of length 1, so grabbing 0th object is ok + if ( + rule.get("IpProtocol") == "-1" + and len(rule.get("UserIdGroupPairs")) > 0 + and rule.get("UserIdGroupPairs")[0].get("GroupId") == sg_id + ): + allowed_in = True + break + for rule in out_rules: + if ( + rule.get("IpProtocol") == "-1" + and len(rule.get("UserIdGroupPairs")) > 0 + and rule.get("UserIdGroupPairs")[0].get("GroupId") == sg_id + ): + allowed_out = True + break + if not (allowed_in and allowed_out): + self.__fail( + resource_type, + "VPC Security Group %s must allow all traffic in and out from itself. " + "See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-security" % sg_id, + ) + except ClientError as e: + self.__fail(resource_type, e.response.get("Error").get("Message")) + + def __validate_efa_parameters(self, resource_type, resource_value): + if resource_value.get("PlacementGroup", "NONE") == "NONE": + self.__fail(resource_type, "Placement group is required, set placement_group.") + if "VPCSecurityGroupId" in resource_value: + sg_id = resource_value.get("VPCSecurityGroupId") + self.__validate_efa_sg(resource_type, sg_id) + def validate(self, resource_type, resource_value): # noqa: C901 FIXME """ Validate the given resource. Print an error and exit in case of error. 
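# A standalone sketch of the EFA security-group requirement enforced by the new __validate_efa_sg
# check above: the group must allow all inbound and outbound traffic from/to itself. The helper name
# and the group id in the usage comment are illustrative, not part of the patch.
import boto3


def allows_all_traffic_to_itself(sg_id, region):
    """Return True if sg_id allows all traffic in and out from itself (EFA requirement)."""
    ec2 = boto3.client("ec2", region_name=region)
    sg = ec2.describe_security_groups(GroupIds=[sg_id]).get("SecurityGroups")[0]

    def has_self_referencing_allow_all(rules):
        # IpProtocol == "-1" means all protocols and ports; the rule must point back at the group itself.
        return any(
            rule.get("IpProtocol") == "-1"
            and any(pair.get("GroupId") == sg_id for pair in rule.get("UserIdGroupPairs", []))
            for rule in rules
        )

    return has_self_referencing_allow_all(sg.get("IpPermissions", [])) and has_self_referencing_allow_all(
        sg.get("IpPermissionsEgress", [])
    )


# Usage (placeholder id): allows_all_traffic_to_itself("sg-0123456789abcdef0", "us-east-1")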
@@ -520,6 +577,9 @@ def validate(self, resource_type, resource_value): # noqa: C901 FIXME # FSX FS Id check elif resource_type in ["fsx_fs_id", "FSx_storage_capacity", "FSx_imported_file_chunk_size", "FSx_export_path"]: self.__validate_fsx_parameters(resource_type, resource_value) + elif resource_type == "EFA": + self.__validate_efa_parameters(resource_type, resource_value) + # Batch Parameters elif resource_type == "AWSBatch_Parameters": # Check region @@ -556,7 +616,7 @@ def validate(self, resource_type, resource_value): # noqa: C901 FIXME if "ComputeInstanceType" in resource_value: compute_instance_type = resource_value["ComputeInstanceType"] try: - supported_instances = get_supported_batch_instances(self.region) + supported_instances = get_supported_features(self.region, "batch").get("instances") if supported_instances: for instance in compute_instance_type.split(","): if not instance.strip() in supported_instances: diff --git a/cli/pcluster/easyconfig.py b/cli/pcluster/easyconfig.py index b89ab4f346..2af4d7a527 100644 --- a/cli/pcluster/easyconfig.py +++ b/cli/pcluster/easyconfig.py @@ -40,6 +40,7 @@ def wrapper(*args, **kwargs): except (BotoCoreError, ClientError) as e: print("Failed with error: %s" % e) print("Hint: please check your AWS credentials.") + print("Run `aws configure` or set the credentials as environment variables.") sys.exit(1) return wrapper @@ -80,7 +81,8 @@ def get_regions(): return [region.get("RegionName") for region in regions if region.get("RegionName") not in unsupported_regions] -def ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name): +@handle_client_exception +def ec2_conn(aws_region_name): if aws_region_name: region = aws_region_name elif os.environ.get("AWS_DEFAULT_REGION"): @@ -88,15 +90,13 @@ def ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name): else: region = "us-east-1" - ec2 = boto3.client( - "ec2", region_name=region, aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key - ) + ec2 = boto3.client("ec2", region_name=region) return ec2 @handle_client_exception -def list_keys(aws_access_key_id, aws_secret_access_key, aws_region_name): - conn = ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name) +def list_keys(aws_region_name): + conn = ec2_conn(aws_region_name) keypairs = conn.describe_key_pairs() keynames = [] for key in keypairs.get("KeyPairs"): @@ -111,8 +111,8 @@ def list_keys(aws_access_key_id, aws_secret_access_key, aws_region_name): @handle_client_exception -def list_vpcs(aws_access_key_id, aws_secret_access_key, aws_region_name): - conn = ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name) +def list_vpcs(aws_region_name): + conn = ec2_conn(aws_region_name) vpcs = conn.describe_vpcs() vpcids = [] for vpc in vpcs.get("Vpcs"): @@ -127,8 +127,8 @@ def list_vpcs(aws_access_key_id, aws_secret_access_key, aws_region_name): @handle_client_exception -def list_subnets(aws_access_key_id, aws_secret_access_key, aws_region_name, vpc_id): - conn = ec2_conn(aws_access_key_id, aws_secret_access_key, aws_region_name) +def list_subnets(aws_region_name, vpc_id): + conn = ec2_conn(aws_region_name) subnets = conn.describe_subnets(Filters=[{"Name": "vpcId", "Values": [vpc_id]}]) subnetids = [] for subnet in subnets.get("Subnets"): @@ -161,22 +161,6 @@ def configure(args): # noqa: C901 FIXME!!! 
"Cluster Template", config.get("global", "cluster_template") if config.has_option("global", "cluster_template") else "default", ) - aws_access_key_id = prompt( - "AWS Access Key ID", - config.get("aws", "aws_access_key_id") if config.has_option("aws", "aws_access_key_id") else None, - True, - ) - aws_secret_access_key = prompt( - "AWS Secret Access Key ID", - config.get("aws", "aws_secret_access_key") if config.has_option("aws", "aws_secret_access_key") else None, - True, - ) - if not aws_access_key_id or not aws_secret_access_key: - print( - "You chose not to configure aws credentials in parallelcluster config file.\n" - "Please make sure you export a valid AWS_PROFILE or you have them exported in " - "the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables." - ) # Use built in boto regions as an available option aws_region_name = prompt( @@ -198,13 +182,13 @@ def configure(args): # noqa: C901 FIXME!!! config.get("cluster " + cluster_template, "key_name") if config.has_option("cluster " + cluster_template, "key_name") else None, - options=list_keys(aws_access_key_id, aws_secret_access_key, aws_region_name), + options=list_keys(aws_region_name), check_validity=True, ) vpc_id = prompt( "VPC ID", config.get("vpc " + vpcname, "vpc_id") if config.has_option("vpc " + vpcname, "vpc_id") else None, - options=list_vpcs(aws_access_key_id, aws_secret_access_key, aws_region_name), + options=list_vpcs(aws_region_name), check_validity=True, ) master_subnet_id = prompt( @@ -212,7 +196,7 @@ def configure(args): # noqa: C901 FIXME!!! config.get("vpc " + vpcname, "master_subnet_id") if config.has_option("vpc " + vpcname, "master_subnet_id") else None, - options=list_subnets(aws_access_key_id, aws_secret_access_key, aws_region_name, vpc_id), + options=list_subnets(aws_region_name, vpc_id), check_validity=True, ) @@ -223,12 +207,7 @@ def configure(args): # noqa: C901 FIXME!!! "update_check": "true", "sanity_check": "true", } - s_aws = { - "__name__": "aws", - "aws_access_key_id": aws_access_key_id, - "aws_secret_access_key": aws_secret_access_key, - "aws_region_name": aws_region_name, - } + s_aws = {"__name__": "aws", "aws_region_name": aws_region_name} s_aliases = {"__name__": "aliases", "ssh": "ssh {CFN_USER}@{MASTER_IP} {ARGS}"} s_cluster = {"__name__": "cluster " + cluster_template, "key_name": key_name, "vpc_settings": vpcname} s_vpc = {"__name__": "vpc " + vpcname, "vpc_id": vpc_id, "master_subnet_id": master_subnet_id} diff --git a/cli/pcluster/examples/config b/cli/pcluster/examples/config index fd50b2cbec..6da6acbdba 100644 --- a/cli/pcluster/examples/config +++ b/cli/pcluster/examples/config @@ -8,13 +8,6 @@ update_check = true sanity_check = true [aws] -# This is the AWS credentials section (required). -# These settings apply to all clusters -# replace these with your AWS keys -# If not defined, boto will attempt to use a) environment -# or b) EC2 IAM role. -#aws_access_key_id = #your_aws_access_key_id -#aws_secret_access_key = #your_secret_access_key # Uncomment to specify a different Amazon AWS region (OPTIONAL) # (Defaults to us-east-1 if not defined in environment or below) #aws_region_name = #region diff --git a/cli/pcluster/pcluster.py b/cli/pcluster/pcluster.py index 84b684f78d..1b12c70927 100644 --- a/cli/pcluster/pcluster.py +++ b/cli/pcluster/pcluster.py @@ -34,6 +34,7 @@ import boto3 import pkg_resources from botocore.exceptions import ClientError +from tabulate import tabulate from . 
import cfnconfig, utils @@ -63,9 +64,9 @@ def create_bucket_with_batch_resources(stack_name, aws_client_config, resources_ return s3_bucket_name -def version(args): +def version(): pcluster_version = pkg_resources.get_distribution("aws-parallelcluster").version - LOGGER.info(pcluster_version) + return pcluster_version def create(args): # noqa: C901 FIXME!!! @@ -116,6 +117,7 @@ def create(args): # noqa: C901 FIXME!!! cfn_params = [{"ParameterKey": key, "ParameterValue": value} for key, value in config.parameters.items()] tags = [{"Key": t, "Value": config.tags[t]} for t in config.tags] + tags.append({"Key": "Version", "Value": version()}) stack = cfn.create_stack( StackName=stack_name, @@ -275,12 +277,12 @@ def update(args): # noqa: C901 FIXME!!! config.parameters["AvailabilityZone"] = availability_zone try: - LOGGER.debug((config.template_url, config.parameters)) + LOGGER.debug(config.parameters) cfn_params = [{"ParameterKey": key, "ParameterValue": value} for key, value in config.parameters.items()] LOGGER.info("Calling update_stack") cfn.update_stack( - StackName=stack_name, TemplateURL=config.template_url, Parameters=cfn_params, Capabilities=capabilities + StackName=stack_name, UsePreviousTemplate=True, Parameters=cfn_params, Capabilities=capabilities ) status = cfn.describe_stacks(StackName=stack_name).get("Stacks")[0].get("StackStatus") if not args.nowait: @@ -326,8 +328,9 @@ def start(args): if config.parameters.get("MinSize") and int(config.parameters.get("MinSize")) > 0 else 0 ) + ce_name = get_batch_ce(stack_name, config) start_batch_ce( - ce_name=stack_name, config=config, min_vcpus=min_vcpus, desired_vcpus=desired_vcpus, max_vcpus=max_vcpus + ce_name=ce_name, config=config, min_vcpus=min_vcpus, desired_vcpus=desired_vcpus, max_vcpus=max_vcpus ) else: LOGGER.info("Starting compute fleet : %s", args.cluster_name) @@ -362,7 +365,8 @@ def stop(args): if config.parameters.get("Scheduler") == "awsbatch": LOGGER.info("Disabling AWS Batch compute environment : %s", args.cluster_name) - stop_batch_ce(ce_name=stack_name, config=config) + ce_name = get_batch_ce(stack_name, config) + stop_batch_ce(ce_name=ce_name, config=config) else: LOGGER.info("Stopping compute fleet : %s", args.cluster_name) # Set Resource limits @@ -370,6 +374,55 @@ def stop(args): set_asg_limits(asg_name=asg_name, config=config, min=0, max=0, desired=0) +def get_batch_ce(stack_name, config): + """ + Get name of the AWS Batch Compute Environment. + + :param stack_name: name of the master stack + :param config: config + :return: ce_name or exit if not found + """ + cfn = boto3.client( + "cloudformation", + region_name=config.region, + aws_access_key_id=config.aws_access_key_id, + aws_secret_access_key=config.aws_secret_access_key, + ) + + try: + outputs = cfn.describe_stacks(StackName=stack_name).get("Stacks")[0].get("Outputs") + return _get_output_value(outputs, "BatchComputeEnvironmentArn") + except ClientError as e: + LOGGER.critical(e.response.get("Error").get("Message")) + sys.exit(1) + + +def get_version(stack): + """ + Get the version of the stack if tagged. + + :param stack: stack object + :return: version or empty string + """ + return next((tag.get("Value") for tag in stack.get("Tags") if tag.get("Key") == "Version"), "") + + +def colorize(stack_status, args): + """ + Color the output, COMPLETE = green, FAILED = red, IN_PROGRESS = yellow. 
+ + :param status: stack status + :return: colorized status string + """ + if not args.color: + return stack_status + end = "0m" + status_to_color = {"COMPLETE": "0;32m", "FAILED": "0;31m", "IN_PROGRESS": "10;33m"} + for status in status_to_color: + if status in stack_status: + return "\033[%s%s\033[%s" % (status_to_color[status], stack_status, end) + + def list_stacks(args): config = cfnconfig.ParallelClusterConfig(args) cfn = boto3.client( @@ -380,9 +433,18 @@ def list_stacks(args): ) try: stacks = cfn.describe_stacks().get("Stacks") + result = [] for stack in stacks: if stack.get("ParentId") is None and stack.get("StackName").startswith("parallelcluster-"): - LOGGER.info("%s", stack.get("StackName")[len("parallelcluster-") :]) # noqa: E203 + pcluster_version = get_version(stack) + result.append( + [ + stack.get("StackName")[len("parallelcluster-") :], # noqa: E203 + colorize(stack.get("StackStatus"), args), + pcluster_version, + ] + ) + LOGGER.info(tabulate(result, tablefmt="plain")) except ClientError as e: LOGGER.critical(e.response.get("Error").get("Message")) sys.exit(1) diff --git a/cli/pcluster/resources/batch/custom_resources_code/deregister_batch_mnp_job_definitions.py b/cli/pcluster/resources/batch/custom_resources_code/deregister_batch_mnp_job_definitions.py deleted file mode 100644 index 8de1e82c38..0000000000 --- a/cli/pcluster/resources/batch/custom_resources_code/deregister_batch_mnp_job_definitions.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with -# the License. A copy of the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and -# limitations under the License. -import re -import time - -import boto3 -from botocore.exceptions import ClientError - -import crhelper - -# initialise logger -logger = crhelper.log_config({"RequestId": "CONTAINER_INIT"}, loglevel="info") -logger.info("Logging configured") -# set global to track init failures -init_failed = False - -try: - # Place initialization code here - logger.info("Container initialization completed") - batch_client = boto3.client("batch") -except Exception as e: - logger.error(e, exc_info=True) - init_failed = e - - -def get_job_definition_name_by_arn(job_definition_arn): - """ - Parse Job Definition arn and get name. - - Args: - job_definition_arn: something like arn:aws:batch:::job-definition/: - - Returns: the job definition name - """ - pattern = r".*/(.*):(.*)" - return re.search(pattern, job_definition_arn).group(1) - - -def retrieve_job_definition_revisions(name): - """ - Retrieve all revisions for a given job definition. 
- - Args: - name: name of the job definition - - Returns: an array containing all job definition revisions ARNs - """ - next_token = "" - job_definitions = [] - while next_token is not None: - response = batch_client.describe_job_definitions(jobDefinitionName=name, nextToken=next_token, status="ACTIVE") - if "jobDefinitions" in response: - for job_definition in response["jobDefinitions"]: - job_definitions.append(job_definition["jobDefinitionArn"]) - next_token = response.get("nextToken") - # Since it's not a time critical operation, sleeping to avoid hitting API's TPS limit. - time.sleep(0.5) - - return job_definitions - - -def deregister_job_definition_revisions(name): - """ - De-register all revisions belonging to a given job definition. - - Args: - name: name of the job definition - """ - job_definitions = retrieve_job_definition_revisions(name) - for job_definition in job_definitions: - try: - logger.info("De-registering job definition: %s" % job_definition) - batch_client.deregister_job_definition(jobDefinition=job_definition) - except ClientError: - logger.warning("job definition not found: %s. It was probably manually de-registered." % job_definition) - # Since it's not a time critical operation, sleeping to avoid hitting API's TPS limit. - time.sleep(0.5) - - -def create(event, context): - """Noop.""" - return "MNPJobDefinitionCleanupHandler", {} - - -def update(event, context): - """Noop.""" - return event["MNPJobDefinitionCleanupHandler"], {} - - -def delete(event, context): - """Deregister all mnp job definitions.""" - job_definition = get_job_definition_name_by_arn(event["ResourceProperties"]["JobDefinitionMNPArn"]) - logger.info("Job definition %s deletion: STARTED" % job_definition) - deregister_job_definition_revisions(job_definition) - logger.info("Job definition %s deletion: COMPLETED" % job_definition) - - -def handler(event, context): - """Main handler function, passes off it's work to crhelper's cfn_handler.""" # noqa: D401 - # update the logger with event info - global logger - logger = crhelper.log_config(event, loglevel="info") - return crhelper.cfn_handler(event, context, create, update, delete, logger, init_failed) diff --git a/cli/pcluster/utils.py b/cli/pcluster/utils.py index a49d031c9c..b282a7ec42 100644 --- a/cli/pcluster/utils.py +++ b/cli/pcluster/utils.py @@ -131,21 +131,37 @@ def _get_json_from_s3(region, file_name): return json.loads(file_contents) -def get_supported_batch_instances(region): +def get_supported_features(region, feature): """ - Get a json object containing the instances supported by batch. + Get a json object containing the attributes supported by a feature, for example. + + { + "Features": { + "efa": { + "instances": ["c5n.18xlarge", "p3dn.24xlarge", "i3en.24xlarge"], + "baseos": ["alinux", "centos7"], + "schedulers": ["sge", "slurm", "torque"] + }, + "awsbatch": { + "instances": ["r3.8xlarge", ..., "m5.4xlarge"] + } + } + } :param region: AWS Region - :param instance_type: the instance type to search for. - :return: json object containing the instances supported by batch - or an empty object if unable to parse/get the instance list file + :param feature: the feature to search for, i.e. 
"efa" "awsbatch" + :return: json object containing all the attributes supported by feature """ try: - instances = _get_json_from_s3(region, "instances/batch_instances.json") - except (ValueError, ClientError): - instances = "" - - return instances + features = _get_json_from_s3(region, "features/feature_whitelist.json") + supported_features = features.get("Features").get(feature) + except (ValueError, ClientError, KeyError): + print( + "Failed validate %s. This is probably a bug on our end. Please set sanity_check = false and retry" % feature + ) + exit(1) + + return supported_features def get_instance_vcpus(region, instance_type): diff --git a/cli/setup.py b/cli/setup.py index b744e7cc3e..857e19ba1e 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -21,7 +21,7 @@ def readme(): return f.read() -VERSION = "2.3.1" +VERSION = "2.4.0" REQUIRES = ["boto3>=1.9.54", "future>=0.16.0,<=0.17.1", "tabulate>=0.8.2,<=0.8.3"] if sys.version_info[:2] == (2, 6): diff --git a/cli/tox.ini b/cli/tox.ini index 565059f727..af4a7efa4a 100644 --- a/cli/tox.ini +++ b/cli/tox.ini @@ -9,7 +9,6 @@ envlist = # Default testenv. Used to run tests on all python versions. [testenv] -# deps = -rtests/requirements.txt TODO: ADD UNIT TESTS whitelist_externals = bash deps = -rtests/requirements.txt @@ -61,7 +60,9 @@ commands = [testenv:isort] basepython = python3 skip_install = true -deps = isort +deps = + isort + seed-isort-config commands = isort -rc -w 120 \ {[vars]code_dirs} \ diff --git a/cloudformation/aws-parallelcluster.cfn.json b/cloudformation/aws-parallelcluster.cfn.json index aae3381aba..ce46f6efaa 100644 --- a/cloudformation/aws-parallelcluster.cfn.json +++ b/cloudformation/aws-parallelcluster.cfn.json @@ -1,6 +1,6 @@ { "AWSTemplateFormatVersion": "2010-09-09", - "Description": "AWS ParallelCluster Template. Version: aws-parallelcluster-2.3.1", + "Description": "AWS ParallelCluster Template. 
Version: aws-parallelcluster-2.4.0", "Metadata": { "AWS::CloudFormation::Interface": { "ParameterGroups": [ @@ -598,6 +598,11 @@ "Description": "Comma separated list of efs related options, 8 parameters in total, [shared_dir,efs_fs_id,performance_mode,efs_kms_key_id,provisioned_throughput,encrypted,throughput_mode,valid_existing_MTorNot]", "Type": "String", "Default": "NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE" + }, + "EFA": { + "Description": "Enable EFA on the compute nodes, enable_efa = compute", + "Type": "String", + "Default": "NONE" } }, "Conditions": { @@ -1118,26 +1123,16 @@ "aws-us-gov" ] }, - "ChinaRegion": { - "Fn::Equals": [ - { - "Ref": "AWS::Partition" - }, - "aws-cn" - ] - }, "CreateLaunchTemplate": { "Fn::And": [ { "Fn::Not": [ { - "Fn::Or": [ + "Fn::Equals": [ { - "Condition": "ChinaRegion" + "Ref": "AWS::Region" }, - { - "Condition": "GovCloudRegion" - } + "us-gov-east-1" ] } ] @@ -1150,13 +1145,11 @@ "CreateLaunchConfig": { "Fn::And": [ { - "Fn::Or": [ + "Fn::Equals": [ { - "Condition": "ChinaRegion" + "Ref": "AWS::Region" }, - { - "Condition": "GovCloudRegion" - } + "us-gov-east-1" ] }, { @@ -1207,146 +1200,160 @@ ] } ] + }, + "EnableEFA": { + "Fn::Not": [ + { + "Fn::Equals": [ + { + "Ref": "EFA" + }, + "NONE" + ] + } + ] } }, "Mappings": { "AWSRegionOS2AMI": { "ap-northeast-1": { - "alinux": "ami-0af8c1a29f58c3b91", - "centos6": "ami-0476984f547d1f4f2", - "centos7": "ami-0f13f45e966236e46", - "ubuntu1404": "ami-0ce1c5516c087ef8d", - "ubuntu1604": "ami-041f6050eff86f024" + "alinux": "ami-0dcc18768374b4441", + "centos6": "ami-086781b933db101a5", + "centos7": "ami-09bae677f8f58842d", + "ubuntu1404": "ami-0939e3e1030d4f7d2", + "ubuntu1604": "ami-06b328a6ee03ccdf4" }, "ap-northeast-2": { - "alinux": "ami-036c289fda8701f9d", - "centos6": "ami-06ecb1e81881cd450", - "centos7": "ami-016c726d8902d133c", - "ubuntu1404": "ami-0744c53e9582abcd4", - "ubuntu1604": "ami-0df4c1dafbfee5031" + "alinux": "ami-022e7c66ccb807c9f", + "centos6": "ami-07d646c87d889d816", + "centos7": "ami-0eeb6c96d0e6c2d90", + "ubuntu1404": "ami-0481c6b023e2328b4", + "ubuntu1604": "ami-0179e2707f709f813" }, "ap-northeast-3": { - "alinux": "ami-000902aa3082732ce", - "centos6": "ami-04d195b55ddf56228", - "centos7": "ami-037c3a13cd142c8f8", - "ubuntu1404": "ami-0d0faa548bcca5fac", - "ubuntu1604": "ami-08d3ef362e1d06e56" + "alinux": "ami-04402be7b85999df8", + "centos6": "ami-082ece6e5fe8f6fd1", + "centos7": "ami-084c0dbc04f722758", + "ubuntu1404": "ami-0a535e1d0bb7bc502", + "ubuntu1604": "ami-0c9b72bae5efc9f61" }, "ap-south-1": { - "alinux": "ami-00ff6216daa4b0a69", - "centos6": "ami-0b1abd2bf8810487c", - "centos7": "ami-06b7212503b9d9637", - "ubuntu1404": "ami-00721e9f7f8235dba", - "ubuntu1604": "ami-0ef148f6ae69767d7" + "alinux": "ami-0a14b1f0e7427a4bb", + "centos6": "ami-02389426198baf430", + "centos7": "ami-031f8f67a53de53fe", + "ubuntu1404": "ami-000e99acc047832ae", + "ubuntu1604": "ami-0f21d1eb3339ebd6a" }, "ap-southeast-1": { - "alinux": "ami-03b015a13daa9ff8d", - "centos6": "ami-0576b4b2db8272abf", - "centos7": "ami-0c39937e9ae643ecd", - "ubuntu1404": "ami-03df9d0a89a448c63", - "ubuntu1604": "ami-0b63a13236ce5b8d9" + "alinux": "ami-02079735c20c1ac4e", + "centos6": "ami-02105387481bd0ad0", + "centos7": "ami-041ca5c2f5b748966", + "ubuntu1404": "ami-09ca9a6a8fee71ba5", + "ubuntu1604": "ami-01899e9a659eb2267" }, "ap-southeast-2": { - "alinux": "ami-0c2528255cc7c4cec", - "centos6": "ami-09a18baab0a142123", - "centos7": "ami-0164dbfb6b7b938f5", - "ubuntu1404": "ami-06116e2159f6ba6bf", - "ubuntu1604": 
"ami-0f5a3072f23556b07" + "alinux": "ami-0c65952cdec26ae39", + "centos6": "ami-0050fad9761b3957c", + "centos7": "ami-06c7f5584ecfcac3a", + "ubuntu1404": "ami-09646cc49a932a37e", + "ubuntu1604": "ami-049c81a79d55b2c8a" }, "ca-central-1": { - "alinux": "ami-05bad5df22b9502e5", - "centos6": "ami-0aa03a3f1b737c651", - "centos7": "ami-0ee7cb4d2673e78de", - "ubuntu1404": "ami-0d180013cf3d07fc9", - "ubuntu1604": "ami-0c88262f6fd2738fc" + "alinux": "ami-01f28f8381746746f", + "centos6": "ami-0e70755a47200df23", + "centos7": "ami-0afc2ea67b3963398", + "ubuntu1404": "ami-06ac5db73837bc364", + "ubuntu1604": "ami-0b8928a1f643684eb" }, "cn-north-1": { - "alinux": "ami-0227bedfc6798cba1", - "ubuntu1404": "ami-0ef85bbc4ba66c301", - "ubuntu1604": "ami-017ea2a40c48f9af4" + "alinux": "ami-0da67c26ce2e8d111", + "ubuntu1404": "ami-07e16a5709c99f963", + "ubuntu1604": "ami-0ae967dc97d5eb57a" }, "cn-northwest-1": { - "alinux": "ami-08143603c5f390f20" + "alinux": "ami-03dc8f759de9de690", + "ubuntu1404": "ami-05348579489ba3673", + "ubuntu1604": "ami-0ba0b1ed49ce7b1b1" }, "eu-central-1": { - "alinux": "ami-003262ea853b26050", - "centos6": "ami-092bd9c46746d940b", - "centos7": "ami-0bcced571d9cc0142", - "ubuntu1404": "ami-04b116ae9a44c861f", - "ubuntu1604": "ami-06a21b6e0815065a4" + "alinux": "ami-0ff6d2a86b9199e82", + "centos6": "ami-03979ebb9cfee2ccc", + "centos7": "ami-0205eaef48a9fc97a", + "ubuntu1404": "ami-0032889c720d364dc", + "ubuntu1604": "ami-002422c65a5bb1af8" }, "eu-north-1": { - "alinux": "ami-06cac8aed0729f14c", - "centos6": "ami-07b83433077d8345b", - "centos7": "ami-00255a59ce6bd8147", - "ubuntu1404": "ami-0de1c666987bbdb1f", - "ubuntu1604": "ami-0418320f06192d788" + "alinux": "ami-0cb08caa10d113ed7", + "centos6": "ami-085a9ecbf9f64f65b", + "centos7": "ami-0420576e18a5fcb7c", + "ubuntu1404": "ami-0976908358f0bfa01", + "ubuntu1604": "ami-0d3c7ce730c73ab00" }, "eu-west-1": { - "alinux": "ami-0691d6d6d4d209e09", - "centos6": "ami-09880c7e25df69af8", - "centos7": "ami-00c07933e0ea22f7d", - "ubuntu1404": "ami-01b114f6a268d6a42", - "ubuntu1604": "ami-0809bc00666e41cfa" + "alinux": "ami-0b5c32b12b9c340d0", + "centos6": "ami-070ba56e38a744df5", + "centos7": "ami-0f67868de5be7b0b3", + "ubuntu1404": "ami-0f5c65a609ad3afb4", + "ubuntu1604": "ami-00328873639859269" }, "eu-west-2": { - "alinux": "ami-0d241a5c57ee3421d", - "centos6": "ami-0eba961d9f30431b2", - "centos7": "ami-09aa34259643c50eb", - "ubuntu1404": "ami-0f9ad3c001b80325a", - "ubuntu1604": "ami-04d8578267aaa2ac4" + "alinux": "ami-0c218c2aaa7185f03", + "centos6": "ami-08553013e6e986028", + "centos7": "ami-057fa1a5314e3c414", + "ubuntu1404": "ami-08c2d96c2805037e7", + "ubuntu1604": "ami-0c1de72c6acf4b187" }, "eu-west-3": { - "alinux": "ami-0e59dd1d2794a857c", - "centos6": "ami-0d0b243ac76765544", - "centos7": "ami-04ce6f74e1070a795", - "ubuntu1404": "ami-0f921986737ab8306", - "ubuntu1604": "ami-02de781189ccb9f92" + "alinux": "ami-011e0eee21d52f23e", + "centos6": "ami-0afff5bc147c847e0", + "centos7": "ami-05b2808c2dc4fb82c", + "ubuntu1404": "ami-0f6cd6ac9be8f2b32", + "ubuntu1604": "ami-090d577bb6d08e95b" }, "sa-east-1": { - "alinux": "ami-07b044055a13cf93e", - "centos6": "ami-0dfdc6ab8bf7935ea", - "centos7": "ami-0a625e9dcf563db57", - "ubuntu1404": "ami-0d1d30ad051235185", - "ubuntu1604": "ami-088d6a838e8dc6b11" + "alinux": "ami-0d154ae55458941fd", + "centos6": "ami-0635a9bdc378fe67f", + "centos7": "ami-0da1262e3c5d9af72", + "ubuntu1404": "ami-0d0da341da4802af9", + "ubuntu1604": "ami-08df8912b098a3f42" }, "us-east-1": { - "alinux": "ami-0f8b01b1377483305", 
- "centos6": "ami-00f71e3be938f3077", - "centos7": "ami-0658a809b3e89b0c9", - "ubuntu1404": "ami-0422aa8ec2e452870", - "ubuntu1604": "ami-0a8c4ea1bd1ff7651" + "alinux": "ami-0d130bdfab2037f8a", + "centos6": "ami-091f37e900368fe1a", + "centos7": "ami-031eb9c5390c0f8f6", + "ubuntu1404": "ami-017bfe181606779d8", + "ubuntu1604": "ami-08e1d33a6a64499de" }, "us-east-2": { - "alinux": "ami-049afa5b53a7880d8", - "centos6": "ami-0b29637d31cf774aa", - "centos7": "ami-07cef254f8886ea4e", - "ubuntu1404": "ami-02447e477105886bd", - "ubuntu1604": "ami-04d5c390495e0509f" + "alinux": "ami-00d2a10466c577ac7", + "centos6": "ami-055404b3df678da86", + "centos7": "ami-0050bd80a1cecfe37", + "ubuntu1404": "ami-043eb896e1bb2b948", + "ubuntu1604": "ami-0219fdb6f47395d88" }, "us-gov-east-1": { - "alinux": "ami-02ee5c66a10526bd1", - "ubuntu1404": "ami-03538e53996b83762", - "ubuntu1604": "ami-0bfb76fbbbb68030d" + "alinux": "ami-0f5003922daf22962", + "ubuntu1404": "ami-060ced48ab370aadf", + "ubuntu1604": "ami-0af2c8e5bf3c334b0" }, "us-gov-west-1": { - "alinux": "ami-7da7d01c", - "ubuntu1404": "ami-90a2d5f1", - "ubuntu1604": "ami-eeaed98f" + "alinux": "ami-ba83fbdb", + "ubuntu1404": "ami-32f98153", + "ubuntu1604": "ami-7b85fd1a" }, "us-west-1": { - "alinux": "ami-02c87842ea944292e", - "centos6": "ami-08dc392067bcf9807", - "centos7": "ami-0454b933360a077e4", - "ubuntu1404": "ami-0f4a99f972b9b4882", - "ubuntu1604": "ami-0a33d79d5f920cc2c" + "alinux": "ami-0b6f7961ee845966e", + "centos6": "ami-0e438402399c457d7", + "centos7": "ami-09bd008b253048b80", + "ubuntu1404": "ami-0d48f8a9d5735efde", + "ubuntu1604": "ami-066818f6a6be06fb5" }, "us-west-2": { - "alinux": "ami-09b457d5cba24514a", - "centos6": "ami-0fa309858f6ce66ee", - "centos7": "ami-03b7e311ae2f4aacb", - "ubuntu1404": "ami-04caeb57df33aba89", - "ubuntu1604": "ami-00050b3048393bc12" + "alinux": "ami-0d611d90619419e93", + "centos6": "ami-0651b7e7cfde4b3a0", + "centos7": "ami-003da28849bc413f5", + "ubuntu1404": "ami-0169da6ccb6347f50", + "ubuntu1604": "ami-07122cb5a96b7fee9" } }, "OSFeatures": { @@ -1384,8 +1391,8 @@ }, "PackagesVersions": { "default": { - "parallelcluster": "2.3.1", - "cookbook": "aws-parallelcluster-cookbook-2.3.1", + "parallelcluster": "2.4.0", + "cookbook": "aws-parallelcluster-cookbook-2.4.0", "chef": "14.2.0", "ridley": "5.1.1", "berkshelf": "7.0.4", @@ -1547,6 +1554,7 @@ }, "DynamoDBTable": { "Type": "AWS::DynamoDB::Table", + "UpdateReplacePolicy": "Retain", "Properties": { "AttributeDefinitions": [ { @@ -1707,7 +1715,8 @@ "autoscaling:TerminateInstanceInAutoScalingGroup", "autoscaling:SetDesiredCapacity", "autoscaling:UpdateAutoScalingGroup", - "autoscaling:DescribeTags" + "autoscaling:DescribeTags", + "autoscaling:SetInstanceHealth" ], "Effect": "Allow", "Resource": [ @@ -2070,11 +2079,17 @@ ] } }, + { + "Key": "aws-parallelcluster-networking", + "Value": { + "Fn::Sub": "EFA=${EFA}" + } + }, { "Key": "aws-parallelcluster-filesystem", "Value": { "Fn::Sub": [ - "efs=${efs}, multiebs=${NumberOfEBSVol}, raid=${raid}", + "efs=${efs}, multiebs=${NumberOfEBSVol}, raid=${raid}, fsx=${fsx}", { "efs": { "Fn::If": [ @@ -2089,6 +2104,13 @@ "1", "0" ] + }, + "fsx": { + "Fn::If": [ + "CreateFSXSubstack", + "1", + "0" + ] } } ] @@ -2225,7 +2247,7 @@ " if [ \"${apt}\" == \"0\" ]; then\n", " apt-cache search build-essential; apt-get clean; apt-get update; apt-get -y install build-essential curl wget jq\n", " fi\n", - " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn\"", + " [[ ${_region} =~ ^cn- ]] && 
s3_url=\"cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", " which cfn-init 2>/dev/null || ( curl -s -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://s3.${s3_url}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz; easy_install -U /tmp/aws-cfn-bootstrap-latest.tar.gz)\n", " mkdir -p /etc/chef && chown -R root:root /etc/chef\n", " curl --retry 3 -L https://www.chef.io/chef/install.sh | bash -s -- -v ${chef_version}\n", @@ -2382,7 +2404,8 @@ "chefPrepEnv", "shellRunPreInstall", "chefConfig", - "shellRunPostInstall" + "shellRunPostInstall", + "chefFinalize" ] }, "deployConfigFiles": { @@ -2396,6 +2419,9 @@ "stack_name": { "Ref": "AWS::StackName" }, + "enable_efa": { + "Ref": "EFA" + }, "cfn_raid_vol_ids": { "Fn::If": [ "CreateRAIDSubstack", @@ -2593,6 +2619,14 @@ "command": "/opt/parallelcluster/scripts/fetch_and_run -postinstall" } } + }, + "chefFinalize": { + "commands": { + "chef": { + "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster::finalize", + "cwd": "/etc/chef" + } + } } } }, @@ -2707,6 +2741,13 @@ }, "PropagateAtLaunch": true }, + { + "Key": "aws-parallelcluster-networking", + "Value": { + "Fn::Sub": "EFA=${EFA}" + }, + "PropagateAtLaunch": true + }, { "Key": "aws-parallelcluster-filesystem", "Value": { @@ -3016,15 +3057,22 @@ "#!/bin/bash -x\n\n", "function error_exit\n", "{\n", - " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", - { - "Ref": "AWS::StackName" - }, - " --resource=ComputeFleet --region=", + " region=", { "Ref": "AWS::Region" }, "\n", + " instance_id=$(curl --retry 3 --retry-delay 0 --silent --fail http://169.254.169.254/latest/meta-data/instance-id)\n", + " log_dir=/home/logs/compute\n", + " mkdir -p ${log_dir}\n", + " echo \"Reporting instance as unhealthy and dumping logs to ${log_dir}/${instance_id}.tar.gz\"\n", + " tar -czf ${log_dir}/${instance_id}.tar.gz /var/log\n", + " aws --region ${region} autoscaling set-instance-health --instance-id ${instance_id} --health-status Unhealthy\n", + " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", + { + "Ref": "AWS::StackName" + }, + " --resource=ComputeFleet --region=${region}\n", " exit 1\n", "}\n", "function vendor_cookbook\n", @@ -3047,7 +3095,7 @@ " if [ \"${apt}\" == \"0\" ]; then\n", " apt-cache search build-essential; apt-get clean; apt-get update; apt-get -y install build-essential curl wget jq\n", " fi\n", - " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn\"", + " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", " which cfn-init 2>/dev/null || ( curl -s -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://s3.${s3_url}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz; easy_install -U /tmp/aws-cfn-bootstrap-latest.tar.gz)\n", " mkdir -p /etc/chef && chown -R root:root /etc/chef\n", " curl --retry 3 -L https://www.chef.io/chef/install.sh | bash -s -- -v ${chef_version}\n", @@ -3179,7 +3227,7 @@ "Ref": "AWS::Region" }, " || error_exit 'Failed to run cfn-init. 
If --norollback was specified, check /var/log/cfn-init.log and /var/log/cloud-init-output.log.'\n", - "cfn-signal ${proxy_args} --exit-code=0 --reason=\"MasterServer setup complete\" --stack=", + "cfn-signal ${proxy_args} --exit-code=0 --reason=\"ComputeServer setup complete\" --stack=", { "Ref": "AWS::StackName" }, @@ -3205,7 +3253,7 @@ "shellRunPreInstall", "chefConfig", "shellRunPostInstall", - "signalComputeReady" + "chefFinalize" ] }, "deployConfigFiles": { @@ -3219,6 +3267,9 @@ "stack_name": { "Ref": "AWS::StackName" }, + "enable_efa": { + "Ref": "EFA" + }, "cfn_raid_parameters": { "Ref": "RAIDOptions" }, @@ -3391,10 +3442,11 @@ } } }, - "signalComputeReady": { + "chefFinalize": { "commands": { - "compute_ready": { - "command": "/opt/parallelcluster/scripts/compute_ready" + "chef": { + "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster::finalize", + "cwd": "/etc/chef" } } } @@ -3408,6 +3460,15 @@ "NetworkInterfaces": [ { "DeviceIndex": 0, + "InterfaceType": { + "Fn::If": [ + "EnableEFA", + "efa", + { + "Ref": "AWS::NoValue" + } + ] + }, "Groups": [ { "Fn::If": [ @@ -3679,15 +3740,22 @@ "#!/bin/bash -x\n\n", "function error_exit\n", "{\n", - " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", - { - "Ref": "AWS::StackName" - }, - " --resource=ComputeFleet --region=", + " region=", { "Ref": "AWS::Region" }, "\n", + " instance_id=$(curl --retry 3 --retry-delay 0 --silent --fail http://169.254.169.254/latest/meta-data/instance-id)\n", + " log_dir=/home/logs/compute\n", + " mkdir -p ${log_dir}\n", + " echo \"Reporting instance as unhealthy and dumping logs to ${log_dir}/${instance_id}.tar.gz\"\n", + " tar -czf ${log_dir}/${instance_id}.tar.gz /var/log\n", + " aws --region ${region} autoscaling set-instance-health --instance-id ${instance_id} --health-status Unhealthy\n", + " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", + { + "Ref": "AWS::StackName" + }, + " --resource=ComputeFleet --region=${region}\n", " exit 1\n", "}\n", "function vendor_cookbook\n", @@ -3710,7 +3778,7 @@ " if [ \"${apt}\" == \"0\" ]; then\n", " apt-cache search build-essential; apt-get clean; apt-get update; apt-get -y install build-essential curl wget jq\n", " fi\n", - " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn\"", + " [[ ${_region} =~ ^cn- ]] && s3_url=\"cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"", " which cfn-init 2>/dev/null || ( curl -s -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://s3.${s3_url}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz; easy_install -U /tmp/aws-cfn-bootstrap-latest.tar.gz)\n", " mkdir -p /etc/chef && chown -R root:root /etc/chef\n", " curl --retry 3 -L https://www.chef.io/chef/install.sh | bash -s -- -v ${chef_version}\n", @@ -3842,7 +3910,7 @@ "Ref": "AWS::Region" }, " || error_exit 'Failed to run cfn-init. 
If --norollback was specified, check /var/log/cfn-init.log and /var/log/cloud-init-output.log.'\n", - "cfn-signal ${proxy_args} --exit-code=0 --reason=\"MasterServer setup complete\" --stack=", + "cfn-signal ${proxy_args} --exit-code=0 --reason=\"ComputeServer setup complete\" --stack=", { "Ref": "AWS::StackName" }, @@ -3870,7 +3938,7 @@ "shellRunPreInstall", "chefConfig", "shellRunPostInstall", - "signalComputeReady" + "chefFinalize" ] }, "deployConfigFiles": { @@ -3884,6 +3952,9 @@ "stack_name": { "Ref": "AWS::StackName" }, + "enable_efa": { + "Ref": "EFA" + }, "cfn_raid_parameters": { "Ref": "RAIDOptions" }, @@ -4056,10 +4127,11 @@ } } }, - "signalComputeReady": { + "chefFinalize": { "commands": { - "compute_ready": { - "command": "/opt/parallelcluster/scripts/compute_ready" + "chef": { + "command": "chef-client --local-mode --config /etc/chef/client.rb --log_level auto --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist aws-parallelcluster::finalize", + "cwd": "/etc/chef" } } } @@ -4188,6 +4260,35 @@ }, "Condition": "CreateSecurityGroups" }, + "ComputeSecurityGroupEgress": { + "Type": "AWS::EC2::SecurityGroupEgress", + "Properties": { + "IpProtocol": "-1", + "FromPort": 0, + "ToPort": 65535, + "DestinationSecurityGroupId": { + "Ref": "ComputeSecurityGroup" + }, + "GroupId": { + "Ref": "ComputeSecurityGroup" + } + }, + "Condition": "CreateSecurityGroups" + }, + "ComputeSecurityGroupNormalEgress": { + "Type": "AWS::EC2::SecurityGroupEgress", + "Properties": { + "IpProtocol": "-1", + "FromPort": 0, + "ToPort": 65535, + "CidrIp": "0.0.0.0/0", + "GroupId": { + "Ref": "ComputeSecurityGroup" + } + }, + "DependsOn": "ComputeSecurityGroupEgress", + "Condition": "CreateSecurityGroups" + }, "ComputeSecurityGroupIngress": { "Type": "AWS::EC2::SecurityGroupIngress", "Properties": { @@ -4753,8 +4854,7 @@ "MasterServer", "PublicIp" ] - }, - "Condition": "MasterPublicIp" + } }, "GangliaPrivateURL": { "Description": "Private URL to access Ganglia (disabled by default)", @@ -4790,8 +4890,7 @@ "/ganglia/" ] ] - }, - "Condition": "MasterPublicIp" + } }, "ResourcesS3Bucket": { "Description": "S3 user bucket where AWS ParallelCluster resources are stored", diff --git a/cloudformation/batch-substack.cfn.json b/cloudformation/batch-substack.cfn.json index 694d1dd74d..b7cbd97211 100644 --- a/cloudformation/batch-substack.cfn.json +++ b/cloudformation/batch-substack.cfn.json @@ -238,7 +238,6 @@ "Effect": "Allow", "Action": [ "batch:SubmitJob", - "batch:RegisterJobDefinition", "cloudformation:DescribeStacks", "ecs:ListContainerInstances", "ecs:DescribeContainerInstances", @@ -254,24 +253,7 @@ "Fn::Sub": "${JobDefinitionSerial}" }, { - "Fn::Sub": [ - "${MNPJobDefinitionArn}*", - { - "MNPJobDefinitionArn": { - "Fn::Select": [ - "0", - { - "Fn::Split": [ - ":1END_OF_THE_STRING", - { - "Fn::Sub": "${JobDefinitionMNP}END_OF_THE_STRING" - } - ] - } - ] - } - } - ] + "Fn::Sub": "${JobDefinitionMNP}" }, { "Fn::Sub": "${JobQueue}" @@ -1010,101 +992,6 @@ } ] } - }, - "DeregisterBatchMNPJobDefinitionsCustomResource": { - "Type": "AWS::CloudFormation::CustomResource", - "Properties": { - "JobDefinitionMNPArn": { - "Ref": "JobDefinitionMNP" - }, - "ServiceToken": { - "Fn::GetAtt": [ - "DeregisterBatchMNPJobDefinitionsFunction", - "Arn" - ] - } - } - }, - "DeregisterBatchMNPJobDefinitionsFunction": { - "Type": "AWS::Lambda::Function", - "Properties": { - "Code": { - "S3Bucket": { - "Ref": "ResourcesS3Bucket" - }, - "S3Key": "custom_resources_code/artifacts.zip" - }, - 
"Handler": "deregister_batch_mnp_job_definitions.handler", - "MemorySize": 128, - "Role": { - "Fn::GetAtt": [ - "DeregisterBatchMNPJobDefinitionsFunctionExecutionRole", - "Arn" - ] - }, - "Runtime": "python3.6", - "Timeout": 120 - } - }, - "DeregisterBatchMNPJobDefinitionsFunctionExecutionRole": { - "Type": "AWS::IAM::Role", - "Properties": { - "AssumeRolePolicyDocument": { - "Statement": [ - { - "Action": [ - "sts:AssumeRole" - ], - "Effect": "Allow", - "Principal": { - "Service": [ - { - "Fn::Sub": "lambda.${S3Url}" - } - ] - } - } - ], - "Version": "2012-10-17" - }, - "Path": "/", - "Policies": [ - { - "PolicyDocument": { - "Statement": [ - { - "Action": [ - "logs:CreateLogStream", - "logs:PutLogEvents" - ], - "Effect": "Allow", - "Resource": "arn:aws:logs:*:*:*", - "Sid": "CloudWatchLogsPolicy" - }, - { - "Action": [ - "batch:*" - ], - "Effect": "Allow", - "Resource": "*", - "Sid": "Batch" - } - ], - "Version": "2012-10-17" - }, - "PolicyName": "LambdaPolicy" - } - ] - } - }, - "DeregisterBatchMNPJobDefinitionsFunctionLogGroup": { - "Type": "AWS::Logs::LogGroup", - "Properties": { - "LogGroupName": { - "Fn::Sub": "/aws/lambda/${DeregisterBatchMNPJobDefinitionsFunction}" - }, - "RetentionInDays": 1 - } } }, "Outputs": { diff --git a/cloudformation/ebs-substack.cfn.json b/cloudformation/ebs-substack.cfn.json index a2db553425..a25089a8ce 100644 --- a/cloudformation/ebs-substack.cfn.json +++ b/cloudformation/ebs-substack.cfn.json @@ -1049,7 +1049,7 @@ "Parameters": { "AvailabilityZone": { "Description": "Availability Zone the cluster will launch into. THIS IS REQUIRED", - "Type": "String" + "Type": "AWS::EC2::AvailabilityZone::Name" }, "EBSEncryption": { "Description": "Boolean flag to use EBS encryption for /shared volume. (Not to be used for snapshots)", diff --git a/cloudformation/efs-substack.cfn.json b/cloudformation/efs-substack.cfn.json index d94bd9d5f7..6f9a10b3dd 100644 --- a/cloudformation/efs-substack.cfn.json +++ b/cloudformation/efs-substack.cfn.json @@ -201,7 +201,7 @@ "Parameters": { "ComputeSecurityGroup": { "Description": "SecurityGroup for Mount Target", - "Type": "String" + "Type": "AWS::EC2::SecurityGroup::Id" }, "EFSOptions": { "Description": "Comma separated list of efs related options, 8 parameters in total", diff --git a/cloudformation/raid-substack.cfn.json b/cloudformation/raid-substack.cfn.json index 63159ef4e7..006139de7f 100644 --- a/cloudformation/raid-substack.cfn.json +++ b/cloudformation/raid-substack.cfn.json @@ -641,7 +641,7 @@ "Parameters": { "AvailabilityZone": { "Description": "Availability Zone the cluster will launch into. THIS IS REQUIRED", - "Type": "String" + "Type": "AWS::EC2::AvailabilityZone::Name" }, "RAIDOptions": { "Description": "Comma separated list of RAID related options, 8 parameters in total, [0 shared_dir,1 raid_type,2 num_of_vols,3 vol_type,4 vol_size,5 vol_IOPS,6 encrypted, 7 ebs_kms_key]", diff --git a/docs/conf.py b/docs/conf.py index 453e2252ea..60cb23c822 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,9 +51,9 @@ # built documents. # # The short X.Y version. -version = '2.3' +version = '2.4' # The full version, including alpha/beta/rc tags. -release = '2.3.1' +release = '2.4.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/configuration.rst b/docs/configuration.rst index 4a10f6e0e7..3ed1f24f34 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -3,6 +3,8 @@ Configuration .. 
toctree:: ParallelCluster uses the file ``~/.parallelcluster/config`` by default for all configuration parameters. +You can change the location of the config file via the ``--config`` command option or by setting the +``AWS_PCLUSTER_CONFIG_FILE`` environment variable. An example configuration file can be found at ``site-packages/aws-parallelcluster/examples/config``. @@ -55,16 +57,10 @@ Attempt to validate the existence of the resources defined in parameters. :: aws ^^^ -AWS credentials/region section. +AWS Region section. -These settings apply to all clusters and are REQUIRED. - -For security purposes, AWS highly recommends using the environment, EC2 IAM Roles, or the -`AWS CLI `_ to store credentials rather than saving into the AWS ParallelCluster config file. :: - - [aws] - aws_access_key_id = #your_aws_access_key_id - aws_secret_access_key = #your_secret_access_key +To store credentials, you can use environment variables, IAM roles, or the preferred way, the +`AWS CLI `_ :: # Defaults to us-east-1 if not defined in environment or below aws_region_name = #region @@ -106,6 +102,8 @@ template_url """""""""""" Defines the path to the CloudFormation template used to create the cluster. +Updates use the template the stack was created with. + Defaults to ``https://s3.amazonaws.com/-aws-parallelcluster/templates/aws-parallelcluster-.cfn.json``. :: @@ -423,10 +421,8 @@ that commercial entails all supported regions including us-east-1, us-west-2, et region alinux centos6 centos7 ubuntu1404 ubuntu1604 ============== ====== ============ ============ ============= ============ commercial True True True True True - us-gov-west-1 True False False True True - us-gov-east-1 True False False True True - cn-north-1 True False False True True - cn-northwest-1 True False False False False + govcloud True False False True True + china True False False True True ============== ====== ============ ============ ============= ============ Note: The base_os determines the username used to log into the cluster. @@ -597,6 +593,9 @@ If true, an Elastic IP will be associated to the Master instance. If false, the Master instance will have a Public IP (or not) according to the value of the "Auto-assign Public IP" subnet configuration parameter. +.. note:: + This parameter can't be set to false if :code:`compute_subnet_cidr` is specified. + See :ref:`networking configuration ` for some examples. Defaults to true. :: diff --git a/docs/custom_cookbook.rst b/docs/custom_cookbook.rst index 5b62e77f08..d17968e31f 100644 --- a/docs/custom_cookbook.rst +++ b/docs/custom_cookbook.rst @@ -21,7 +21,7 @@ Steps #. Upload the cookbook, changing ``[your_bucket]`` to a bucket you own :: - $ cd aws-parallelcluster-cookbook/utils + $ cd aws-parallelcluster-cookbook $ /bin/bash util/uploadCookbook.sh --bucket [your_bucket] --srcdir . #. From the output above, add the following variable to the AWS ParallelCluster config file, under the ``[cluster ...]`` section :: diff --git a/docs/getting_started.rst b/docs/getting_started.rst index ccacf5bbdb..1346dada2d 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -65,6 +65,17 @@ was originally installed: Configuring AWS ParallelCluster =============================== +First you'll need to setup your IAM credentials, see `AWS CLI `_ +for more information. 
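As the configuration section above notes, credentials can also come from environment variables or an instance IAM role instead of the AWS CLI config. A minimal sketch using the standard AWS environment variables (the key values below are the same placeholder examples used elsewhere in these docs):

```bash
# Placeholder credentials only -- substitute keys for an IAM user with the required permissions.
export AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE
export AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
export AWS_DEFAULT_REGION=us-east-1

# pcluster reads these through boto3, so no credentials need to live in
# ~/.parallelcluster/config or ~/.aws/credentials.
pcluster list
```

The `aws configure` flow shown next achieves the same result by writing `~/.aws/credentials` instead.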
+ +:: + + $ aws configure + AWS Access Key ID [None]: AKIAIOSFODNN7EXAMPLE + AWS Secret Access Key [None]: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + Default region name [us-east-1]: us-east-1 + Default output format [None]: + Once installed you will need to setup some initial config. The easiest way to do this is below: :: @@ -79,15 +90,6 @@ cluster from. Cluster Template [mycluster]: -Next, you will be prompted for your AWS Access & Secret Keys. Enter the keys for an IAM user with administrative -privileges. -These can also be read from your environment variables or the AWS CLI config. - -:: - - AWS Access Key ID []: - AWS Secret Access Key ID []: - Now, you will be presented with a list of valid AWS region identifiers. Choose the region in which you'd like your cluster to run. @@ -150,7 +152,7 @@ Next, a simple cluster launches into a VPC and uses an existing subnet which sup for the subnet is :code:`0.0.0.0/0 => igw-xxxxxx`. The VPC must have :code:`DNS Resolution = yes` and :code:`DNS Hostnames = yes`. It should also have DHCP options with the correct :code:`domain-name` for the region, as defined in the docs: `VPC DHCP -Options `_. +Options `_. Once all of those settings contain valid values, you can launch the cluster by running the create command: @@ -234,3 +236,6 @@ to allow Inbound connection to the port 80 from your Public IP. .. spelling:: aws + wJalrXUtnFEMI + MDENG + bPxRfiCYEXAMPLEKEY diff --git a/docs/iam.rst b/docs/iam.rst index 40787131e3..276ee17d03 100644 --- a/docs/iam.rst +++ b/docs/iam.rst @@ -86,7 +86,8 @@ In case you are using SGE, Slurm or Torque as a scheduler: "autoscaling:TerminateInstanceInAutoScalingGroup", "autoscaling:SetDesiredCapacity", "autoscaling:DescribeTags", - "autoScaling:UpdateAutoScalingGroup" + "autoScaling:UpdateAutoScalingGroup", + "autoscaling:SetInstanceHealth" ], "Sid": "Autoscaling", "Effect": "Allow" diff --git a/docs/networking.rst b/docs/networking.rst index 3c1e0f3274..d0863f7ad5 100644 --- a/docs/networking.rst +++ b/docs/networking.rst @@ -47,6 +47,7 @@ The configuration to create a new private subnet for compute instances requires vpc_id = vpc-xxxxxx master_subnet_id = subnet- compute_subnet_cidr = 10.0.1.0/24 + use_public_ips = true The configuration to use an existing private network requires the following settings: @@ -57,9 +58,10 @@ The configuration to use an existing private network requires the following sett master_subnet_id = subnet- compute_subnet_id = subnet- -Both these configuration require to have a `NAT Gateway -`_ -or an internal PROXY to enable web access for compute instances. +.. note:: + This second configuration requires a `NAT Gateway + `_ + or an internal PROXY to enable web access for compute instances. AWS ParallelCluster in a single private subnet connected using Direct Connect ----------------------------------------------------------------------------- diff --git a/docs/pre_post_install.rst b/docs/pre_post_install.rst index bb31165822..313af5dcf5 100644 --- a/docs/pre_post_install.rst +++ b/docs/pre_post_install.rst @@ -7,19 +7,39 @@ AWS ParallelCluster can execute arbitrary code either before(pre) or after(post) cluster creation. This code is typically stored in S3 and accessed via HTTP(S) during cluster creation. The code will be executed as root and can be in any script language supported by the cluster OS, typically `bash` or `python`. -pre-install actions are called before any cluster deployment bootstrap such as configuring NAT, EBS and the scheduler. 
+Pre-install actions are called before any cluster deployment bootstrap such as configuring NAT, EBS and the scheduler. Typical pre-install actions may include modifying storage, adding extra users or packages. -post-install actions are called after cluster bootstrap is complete, as the last action before an instance is +Post-install actions are called after cluster bootstrap is complete, as the last action before an instance is considered complete. Typical post-install actions may include changing scheduler settings, modifying storage or packages. -Arguments can be passed to scripts by specifying them in the config. These will be passed double-quoted to the -pre/post-install actions. +Arguments can be passed to scripts by specifying them in the config. If a pre/post-install actions fails, then the instance bootstrap will be considered failed and it will not continue. Success is signalled with an exit code of 0, any other exit code will be considered a fail. +It is possible to differentiate between master and compute nodes execution by sourcing +the ``/etc/parallelcluster/cfnconfig`` file and evaluating the ``cfn_node_type`` environment variable, +whose possible values are ``MasterServer`` and ``ComputeFleet`` for the master and compute node respectively. + +:: + + #!/bin/bash + + . "/etc/parallelcluster/cfnconfig" + + case "${cfn_node_type}" in + MasterServer) + echo "I am the master" >> /tmp/master.txt + ;; + ComputeFleet) + echo "I am a compute node" >> /tmp/compute.txt + ;; + *) + ;; + esac + Configuration ------------- @@ -28,22 +48,19 @@ are not required for basic cluster install. :: - # URL to a preinstall script. This is executed before any of the boot_as_* scripts are run - # (defaults to NONE) + # URL to a preinstall script. This is executed before any of the boot_as_* scripts are run (defaults to NONE) pre_install = NONE - # Arguments to be passed to preinstall script - # (defaults to NONE) + # Arguments to be passed to preinstall script (defaults to NONE) pre_install_args = NONE - # URL to a postinstall script. This is executed after any of the boot_as_* scripts are run - # (defaults to NONE) + # URL to a postinstall script. This is executed after any of the boot_as_* scripts are run (defaults to NONE) post_install = NONE - # Arguments to be passed to postinstall script - # (defaults to NONE) + # Arguments to be passed to postinstall script (defaults to NONE) post_install_args = NONE Arguments --------- The first two arguments ``$0`` and ``$1`` are reserved for the script name and url. +If the pre/post_install_args variable contains a list of parameters it must be double quoted. See example below. :: @@ -51,18 +68,30 @@ The first two arguments ``$0`` and ``$1`` are reserved for the script name and u $1 => s3 url $n => args set by pre/post_install_args +Output +------ +The output of the pre/post-install scripts can be found in the ``/var/log/cfn-init.log`` +and ``/var/log/cfn-init-cmd.log`` files. + Example ------- -The following are some steps to create a simple post install script that installs the R packages in a cluster. +The following are some steps to create a simple post install script that installs a list of packages, specified by the +``post_install_args`` configuration parameter, in a cluster. -1. Create a script. For the R example, see below +1. Create a script :: #!/bin/bash - yum -y install --enablerepo=epel R + echo "post-install script has $# arguments" + for arg in "$@" + do + echo "arg: ${arg}" + done + + yum -y install "${@:2}" 2. 
Upload the script with the correct permissions to S3 @@ -75,6 +104,7 @@ The following are some steps to create a simple post install script that install [cluster default] ... post_install = https://.s3.amazonaws.com/myscript.sh + post_install_args = "R curl wget" If the bucket does not have public-read permission use ``s3`` as URL scheme. @@ -83,8 +113,25 @@ If the bucket does not have public-read permission use ``s3`` as URL scheme. [cluster default] ... post_install = s3:///myscript.sh - + post_install_args = "R curl wget" 4. Launch a cluster ``pcluster create mycluster`` + + +5. Verify the output + +:: + + $ less /var/log/cfn-init.log + 2019-04-11 10:43:54,588 [DEBUG] Command runpostinstall output: post-install script has 4 arguments + arg: s3://eu-eu-west-1/test.sh + arg: R + arg: curl + arg: wget + Loaded plugins: dkms-build-requires, priorities, update-motd, upgrade-helper + Package R-3.4.1-1.52.amzn1.x86_64 already installed and latest version + Package curl-7.61.1-7.91.amzn1.x86_64 already installed and latest version + Package wget-1.18-4.29.amzn1.x86_64 already installed and latest version + Nothing to do diff --git a/docs/s3_resources.rst b/docs/s3_resources.rst index a366c2c09d..629b065a8c 100644 --- a/docs/s3_resources.rst +++ b/docs/s3_resources.rst @@ -31,3 +31,9 @@ This next example gives you read access to the bucket. This does **not** let you :: s3_read_resource = arn:aws:s3:::my_corporate_bucket + +This last example gives you read access to the bucket and to the items stored in the bucket. + +:: + + s3_read_resource = arn:aws:s3:::my_corporate_bucket* diff --git a/docs/tutorials/02_ami_customization.rst b/docs/tutorials/02_ami_customization.rst index 28c2e3f3cf..35c0c90894 100644 --- a/docs/tutorials/02_ami_customization.rst +++ b/docs/tutorials/02_ami_customization.rst @@ -23,9 +23,16 @@ tutorial will guide you through the process. How to customize the AWS ParallelCluster AMI ============================================ -There are three ways to use a custom AWS ParallelCluster AMI, two of them require to build a new AMI that will be -available under your AWS account and one does not require to build anything in advance. Feel free to select the -appropriate method based on your needs. +There are three alternative ways to use a custom AWS ParallelCluster AMI, two of them require to build a new AMI that +will be available under your AWS account and one does not require to build anything in advance: + +- modify an AWS ParallelCluster AMI, when you want to install your software on top of an official AWS ParalleCluster AMI +- build a custom AWS ParallelCluster AMI, when you have an AMI with customization and software already in place, and + want to build an AWS ParalleCluster AMI on top of it +- use a Custom AMI at runtime, when you don't want to create anything in advance, AWS ParallelCluster will install + everything it needs at runtime (during cluster creation time and scale-up time) + +Feel free to select the appropriate method based on your needs. Modify an AWS ParallelCluster AMI --------------------------------- @@ -35,10 +42,13 @@ the components required for AWS ParallelCluster to function installed and config base. #. Find the AMI which corresponds to the region you will be utilizing from the AMI list. - The AMI list to use must match the version of the product e.g. + .. 
warning:: + The AMI list to use must match the version of AWS ParallelCluster, for example: - - for ParallelCluster 2.0.2 -> https://github.com/aws/aws-parallelcluster/blob/v2.0.2/amis.txt - - for CfnCluster 1.6.1 -> https://github.com/aws/aws-parallelcluster/blob/v1.6.1/amis.txt + - for AWS ParallelCluster 2.3.1 -> https://github.com/aws/aws-parallelcluster/blob/v2.3.1/amis.txt + - for AWS ParallelCluster 2.2.1 -> https://github.com/aws/aws-parallelcluster/blob/v2.2.1/amis.txt + - for AWS ParallelCluster 2.1.1 -> https://github.com/aws/aws-parallelcluster/blob/v2.1.1/amis.txt + - for CfnCluster 1.6.1 -> https://github.com/aws/aws-parallelcluster/blob/v1.6.1/amis.txt #. Within the EC2 Console, choose "Launch Instance". #. Navigate to "Community AMIs", and enter the AMI id for your region into the search box. @@ -77,6 +87,9 @@ starting from the one you provide as base:: pcluster createami --ami-id --os +.. warning:: + You cannot use a ParalleCluster AMI as for the create command or the create will fail. + For other parameters, please consult the command help:: pcluster createami -h diff --git a/tests/integration-tests/README.md b/tests/integration-tests/README.md index d62b46c4ae..432125b1c6 100644 --- a/tests/integration-tests/README.md +++ b/tests/integration-tests/README.md @@ -11,6 +11,8 @@ config generation. ## Run Integration Tests +To run the integration tests you have to use Python 3.7. + Before executing integration tests it is required to install all the python dependencies required by the framework. In order to do that simply run the following command: ```bash @@ -140,6 +142,33 @@ The configuration for the custom templates and packages are automatically inject all cluster configs when these are rendered. In case any of these parameters is already set in the cluster config then the value in the config is used. +### Re-use clusters and vpc clusters + +When developing integration tests, it can be helpful to re-use a cluster between tests. +This is easily accomplished with the use of the `--vpc-stack` and `--cluster` flags. + +If you're starting from scratch, run the test with the `--no-delete` flag. +This preserves any stacks created for the test: + +```bash +python -m test_runner \ + ... + --no-delete +``` + +Then when you have a vpc stack and cluster, reference them when starting a test: + +```bash +python -m test_runner \ + ... + --vpc-stack "integ-tests-vpc-ncw7zrccsau8uh6k" + --cluster "efa-demo" + --no-delete +``` + +Keep in mind, the cluster you pass can have different `scheduler`, `os` or other features +than what is specified in the test. This can break the tests in unexpected ways. Be mindful. + ## Write Integration Tests All integration tests are defined in the `integration-tests/tests` directory. diff --git a/tests/integration-tests/cfn_stacks_factory.py b/tests/integration-tests/cfn_stacks_factory.py index 6eec8ac3e8..25d8fb40c2 100644 --- a/tests/integration-tests/cfn_stacks_factory.py +++ b/tests/integration-tests/cfn_stacks_factory.py @@ -47,8 +47,8 @@ def __init__(self): def create_stack(self, stack): """ Create a cfn stack with a given template. + :param stack: stack to create. 
- :return: """ name = stack.name region = stack.region diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index 031ed28cc0..cfe233728c 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -21,7 +21,7 @@ class Cluster: """Contain all static and dynamic data related to a cluster instance.""" - def __init__(self, name, config_file, ssh_key): + def __init__(self, name, ssh_key, config_file): self.name = name self.config_file = config_file self.ssh_key = ssh_key @@ -30,6 +30,31 @@ def __init__(self, name, config_file, ssh_key): self.__cfn_outputs = None self.__cfn_resources = None + def update(self, reset_desired=False, extra_params=None): + """ + Update a cluster with an already updated config. + :param reset_desired: reset the current ASG desired capacity to initial config values + :param extra_params: extra parameters to pass to stack update + """ + # update the cluster + logging.info("Updating cluster {0} with config {1}".format(self.name, self.config_file)) + command = ["pcluster", "update", "--config", self.config_file] + if reset_desired: + command.append("--reset-desired") + if extra_params: + command.extend(["--extra-parameters", extra_params]) + command.append(self.name) + result = run_command(command) + if "Status: {0} - UPDATE_COMPLETE".format(self.cfn_name) not in result.stdout: + error = "Cluster update failed for {0} with output: {1}".format(self.name, result.stdout) + logging.error(error) + raise Exception(error) + logging.info("Cluster {0} updated successfully".format(self.name)) + + # reset cached properties + self.__cfn_outputs = None + self.__cfn_resources = None + @property def cfn_name(self): """Return the name of the CloudFormation stack associated to the cluster.""" @@ -96,7 +121,7 @@ def create_cluster(self, cluster): # create the cluster logging.info("Creating cluster {0} with config {1}".format(name, config)) self.__created_clusters[name] = cluster - result = run_command(["pcluster", "create", "--config", config, name]) + result = run_command(["pcluster", "create", "--norollback", "--config", config, name]) if "Status: {0} - CREATE_COMPLETE".format(cluster.cfn_name) not in result.stdout: error = "Cluster creation failed for {0} with output: {1}".format(name, result.stdout) logging.error(error) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 6aeb851890..0589ca35ea 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -22,6 +22,7 @@ import configparser import pytest +from retrying import retry from cfn_stacks_factory import CfnStack, CfnStacksFactory from clusters_factory import Cluster, ClustersFactory @@ -55,12 +56,17 @@ def pytest_addoption(parser): parser.addoption("--custom-awsbatchcli-package", help="url to a custom awsbatch cli package") parser.addoption("--custom-node-package", help="url to a custom node package") parser.addoption("--custom-ami", help="custom AMI to use in the tests") + parser.addoption("--vpc-stack", help="Name of an existing vpc stack.") + parser.addoption("--cluster", help="Use an existing cluster instead of creating one.") + parser.addoption( + "--no-delete", action="store_true", default=False, help="Don't delete stacks after tests are complete." 
+ ) def pytest_generate_tests(metafunc): """Generate (multiple) parametrized calls to a test function.""" - _parametrize_from_option(metafunc, "instance", "instances") _parametrize_from_option(metafunc, "region", "regions") + _parametrize_from_option(metafunc, "instance", "instances") _parametrize_from_option(metafunc, "os", "oss") _parametrize_from_option(metafunc, "scheduler", "schedulers") @@ -89,24 +95,24 @@ def pytest_configure(config): def pytest_runtest_call(item): """Called to execute the test item.""" _add_properties_to_report(item) - add_default_markers(item) - - check_marker_list(item, "instances", "instance") - check_marker_list(item, "regions", "region") - check_marker_list(item, "oss", "os") - check_marker_list(item, "schedulers", "scheduler") - check_marker_skip_list(item, "skip_instances", "instance") - check_marker_skip_list(item, "skip_regions", "region") - check_marker_skip_list(item, "skip_oss", "os") - check_marker_skip_list(item, "skip_schedulers", "scheduler") - check_marker_dimensions(item) - check_marker_skip_dimensions(item) - logging.info("Running test " + item.name) def pytest_collection_modifyitems(items): """Called after collection has been performed, may filter or re-order the items in-place.""" + add_default_markers(items) + + check_marker_list(items, "instances", "instance") + check_marker_list(items, "regions", "region") + check_marker_list(items, "oss", "os") + check_marker_list(items, "schedulers", "scheduler") + check_marker_skip_list(items, "skip_instances", "instance") + check_marker_skip_list(items, "skip_regions", "region") + check_marker_skip_list(items, "skip_oss", "os") + check_marker_skip_list(items, "skip_schedulers", "scheduler") + check_marker_dimensions(items) + check_marker_skip_dimensions(items) + _add_filename_markers(items) @@ -162,15 +168,19 @@ def clusters_factory(request): def _cluster_factory(cluster_config): cluster_config = _write_cluster_config_to_outdir(request, cluster_config) cluster = Cluster( - name="integ-tests-" + random_alphanumeric(), + name=request.config.getoption("cluster") + if request.config.getoption("cluster") + else "integ-tests-" + random_alphanumeric(), config_file=cluster_config, ssh_key=request.config.getoption("key_path"), ) - factory.create_cluster(cluster) + if not request.config.getoption("cluster"): + factory.create_cluster(cluster) return cluster yield _cluster_factory - factory.destroy_all_clusters() + if not request.config.getoption("no_delete"): + factory.destroy_all_clusters() def _write_cluster_config_to_outdir(request, cluster_config): @@ -180,7 +190,7 @@ def _write_cluster_config_to_outdir(request, cluster_config): exist_ok=True, ) cluster_config_dst = "{out_dir}/clusters_configs/{test_name}.config".format( - out_dir=out_dir, test_name=request.node.nodeid + out_dir=out_dir, test_name=request.node.nodeid.replace("::", "-") ) copyfile(cluster_config, cluster_config_dst) return cluster_config_dst @@ -266,11 +276,12 @@ def _get_default_template_values(vpc_stacks, region, request): @pytest.fixture(scope="session") -def cfn_stacks_factory(): +def cfn_stacks_factory(request): """Define a fixture to manage the creation and destruction of CloudFormation stacks.""" factory = CfnStacksFactory() yield factory - factory.delete_all_stacks() + if not request.config.getoption("no_delete"): + factory.delete_all_stacks() # FIXME: we need to find a better solution to this since AZs are independently mapped to names for each AWS account. 
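The hard-coded availability-zone overrides extended in the next hunk exist because some instance types are not offered in every AZ. A hedged way to verify such a gap before adding a new override entry, assuming an AWS CLI recent enough to ship the `describe-instance-type-offerings` sub-command:

```bash
# List the availability zones of ap-southeast-1 that actually offer c5.xlarge.
# Note that zone names map to physical zones per account, which is what the
# FIXME above refers to.
aws ec2 describe-instance-type-offerings \
    --region ap-southeast-1 \
    --location-type availability-zone \
    --filters Name=instance-type,Values=c5.xlarge \
    --query "InstanceTypeOfferings[].Location" \
    --output text
```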
@@ -283,6 +294,12 @@ def cfn_stacks_factory(): "us-west-2": ["us-west-2a", "us-west-2b", "us-west-2c"], # c5.xlarge is not supported in ap-southeast-2a "ap-southeast-2": ["ap-southeast-2b", "ap-southeast-2c"], + # c4.xlarge is not supported in ap-northeast-2b + "ap-northeast-2": ["ap-northeast-2a", "ap-northeast-2c"], + # c5.xlarge is not supported in ap-southeast-1c + "ap-southeast-1": ["ap-southeast-1a", "ap-southeast-1b"], + # c4.xlarge is not supported in ap-south-1c + "ap-south-1": ["ap-south-1a", "ap-south-1b"], } @@ -311,13 +328,24 @@ def vpc_stacks(cfn_stacks_factory, request): ) vpc_config = VPCConfig(subnets=[public_subnet, private_subnet]) template = VPCTemplateBuilder(vpc_config).build() - stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) - cfn_stacks_factory.create_stack(stack) - vpc_stacks[region] = stack + vpc_stacks[region] = _create_vpc_stack(request, template, region, cfn_stacks_factory) return vpc_stacks +# If stack creation fails it'll retry once more. This is done to mitigate failures due to resources +# not available in randomly picked AZs. +@retry(stop_max_attempt_number=2, wait_fixed=5000) +def _create_vpc_stack(request, template, region, cfn_stacks_factory): + if request.config.getoption("vpc_stack"): + logging.info("Using stack {0} in region {1}".format(request.config.getoption("vpc_stack"), region)) + stack = CfnStack(name=request.config.getoption("vpc_stack"), region=region, template=template.to_json()) + else: + stack = CfnStack(name="integ-tests-vpc-" + random_alphanumeric(), region=region, template=template.to_json()) + cfn_stacks_factory.create_stack(stack) + return stack + + @pytest.fixture(scope="function") def s3_bucket_factory(region): """ diff --git a/tests/integration-tests/conftest_markers.py b/tests/integration-tests/conftest_markers.py index 5e7115abba..9116785ef2 100644 --- a/tests/integration-tests/conftest_markers.py +++ b/tests/integration-tests/conftest_markers.py @@ -35,17 +35,18 @@ class InvalidMarkerError(Exception): pass -def add_default_markers(item): +def add_default_markers(items): """ Add default markers for dimensions that need to be skipped by default for all tests. - :param item: pytest Item object markers are applied to. + :param items: pytest Item object markers are applied to. """ - for dimensions in UNSUPPORTED_DIMENSIONS: - item.add_marker(pytest.mark.skip_dimensions(*dimensions)) + for item in items: + for dimensions in UNSUPPORTED_DIMENSIONS: + item.add_marker(pytest.mark.skip_dimensions(*dimensions)) -def check_marker_list(item, marker_name, arg_name): +def check_marker_list(items, marker_name, arg_name): """ Skip all tests that are annotated with marker marker_name and have the arg value corresponding to arg_name not listed in the list passed as first argument to the marker. @@ -56,33 +57,34 @@ def test(arg_name) The test is executed only if arg_name is equal to "value1" or "value2". - :param item: pytest Item object annotated with markers. + :param items: pytest Item objects annotated with markers. :param marker_name: name of the marker to process. :param arg_name: arg name the marker values should be compared to. 
""" - arg_value = item.funcargs.get(arg_name) - allowed_values = [] - for marker in item.iter_markers(name=marker_name): - _validate_marker(marker_name, [marker_name + "_list"], len(marker.args)) - allowed_values.extend(marker.args[0]) - - if not allowed_values or arg_value in allowed_values: - return - skip_message = ( - "Skipping test {test_name} because {arg_name} {arg_value} is not in {marker} allowed values: " - "{allowed_values}".format( - test_name=item.name, - arg_name=arg_name, - arg_value=arg_value, - marker=marker_name, - allowed_values=allowed_values, + for item in list(items): + arg_value = item.callspec.params.get(arg_name) + allowed_values = [] + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, [marker_name + "_list"], len(marker.args)) + allowed_values.extend(marker.args[0]) + + if not allowed_values or arg_value in allowed_values: + continue + skip_message = ( + "Skipping test {test_name} because {arg_name} {arg_value} is not in {marker} allowed values: " + "{allowed_values}".format( + test_name=item.name, + arg_name=arg_name, + arg_value=arg_value, + marker=marker_name, + allowed_values=allowed_values, + ) ) - ) - logging.info(skip_message) - pytest.skip(skip_message) + logging.info(skip_message) + items.remove(item) -def check_marker_skip_list(item, marker_name, arg_name): +def check_marker_skip_list(items, marker_name, arg_name): """ Skip all tests that are annotated with marker marker_name and have the arg value corresponding to arg_name listed in the list passed as first argument to the marker. @@ -93,30 +95,31 @@ def test(arg_name) The test is executed only if arg_name is not equal to "value1" or "value2". - :param item: pytest Item object annotated with markers. + :param items: pytest Item objects annotated with markers. :param marker_name: name of the marker to process. :param arg_name: arg name the marker values should be compared to. """ - arg_value = item.funcargs.get(arg_name) - for marker in item.iter_markers(name=marker_name): - _validate_marker(marker_name, [marker_name + "_skip_list"], len(marker.args)) - skip_values = marker.args[0] - if arg_value in skip_values: - skip_message = ( - "Skipping test {test_name} because {arg_name} {arg_value} is in {marker} allowed values:" - "{skip_values}".format( - test_name=item.name, - arg_name=arg_name, - arg_value=arg_value, - marker=marker_name, - skip_values=skip_values, + for item in list(items): + arg_value = item.callspec.params.get(arg_name) + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, [marker_name + "_skip_list"], len(marker.args)) + skip_values = marker.args[0] + if arg_value in skip_values: + skip_message = ( + "Skipping test {test_name} because {arg_name} {arg_value} is in {marker} allowed values:" + "{skip_values}".format( + test_name=item.name, + arg_name=arg_name, + arg_value=arg_value, + marker=marker_name, + skip_values=skip_values, + ) ) - ) - logging.info(skip_message) - pytest.skip(skip_message) + logging.info(skip_message) + items.remove(item) -def check_marker_skip_dimensions(item): +def check_marker_skip_dimensions(items): """ Skip all tests that are annotated with @pytest.mark.skip_dimensions and have the args (region, instance, os, scheduler) match those specified in the marker. @@ -130,34 +133,36 @@ def test(region, instance, os, scheduler) The test is executed only if the test args (region, instance, os, scheduler) do not match ("a", "b", "*", "d") - :param item: pytest Item object annotated with markers. 
+ :param items: pytest Item objects annotated with markers. """ marker_name = "skip_dimensions" - args_values = [] - for dimension in DIMENSIONS_MARKER_ARGS: - args_values.append(item.funcargs.get(dimension)) - for marker in item.iter_markers(name=marker_name): - _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) - if len(marker.args) != len(DIMENSIONS_MARKER_ARGS): - logging.error( - "Marker {marker_name} requires the following args: {args}".format( - marker_name=marker_name, args=DIMENSIONS_MARKER_ARGS + for item in list(items): + args_values = [] + for dimension in DIMENSIONS_MARKER_ARGS: + args_values.append(item.callspec.params.get(dimension)) + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) + if len(marker.args) != len(DIMENSIONS_MARKER_ARGS): + logging.error( + "Marker {marker_name} requires the following args: {args}".format( + marker_name=marker_name, args=DIMENSIONS_MARKER_ARGS + ) ) - ) - raise ValueError - dimensions_match = _compare_dimension_lists(args_values, marker.args) - if dimensions_match: - skip_message = ( - "Skipping test {test_name} because dimensions {args_values} match {marker}: " - "{skip_values}".format( - test_name=item.name, args_values=args_values, marker=marker_name, skip_values=marker.args + raise ValueError + dimensions_match = _compare_dimension_lists(args_values, marker.args) + if dimensions_match: + skip_message = ( + "Skipping test {test_name} because dimensions {args_values} match {marker}: " + "{skip_values}".format( + test_name=item.name, args_values=args_values, marker=marker_name, skip_values=marker.args + ) ) - ) - logging.info(skip_message) - pytest.skip(skip_message) + logging.info(skip_message) + items.remove(item) + break -def check_marker_dimensions(item): +def check_marker_dimensions(items): """ Execute all tests that are annotated with @pytest.mark.dimensions and have the args (region, instance, os, scheduler) match those specified in the marker. @@ -170,29 +175,34 @@ def test(region, instance, os, scheduler) The test is executed only if the test args (region, instance, os, scheduler) match ("a", "b", "*", "d") - :param item: pytest Item object annotated with markers. + :param items: pytest Item objects annotated with markers. 
""" marker_name = "dimensions" - test_args_value = [] - for dimension in DIMENSIONS_MARKER_ARGS: - test_args_value.append(item.funcargs.get(dimension)) - allowed_values = [] - for marker in item.iter_markers(name=marker_name): - _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) - allowed_values.append(marker.args) - dimensions_match = _compare_dimension_lists(test_args_value, marker.args) - if dimensions_match: - return - - if allowed_values: - skip_message = ( - "Skipping test {test_name} because dimensions {test_args_value} do not match any marker {marker} values: " - "{allowed_values}".format( - test_name=item.name, test_args_value=test_args_value, marker=marker_name, allowed_values=allowed_values + for item in list(items): + test_args_value = [] + for dimension in DIMENSIONS_MARKER_ARGS: + test_args_value.append(item.callspec.params.get(dimension)) + allowed_values = [] + dimensions_match = False + for marker in item.iter_markers(name=marker_name): + _validate_marker(marker_name, DIMENSIONS_MARKER_ARGS, len(marker.args)) + allowed_values.append(marker.args) + dimensions_match = _compare_dimension_lists(test_args_value, marker.args) + if dimensions_match: + break + + if not dimensions_match and allowed_values: + skip_message = ( + "Skipping test {test_name} because dimensions {test_args_value} do not match any marker {marker}" + " values: {allowed_values}".format( + test_name=item.name, + test_args_value=test_args_value, + marker=marker_name, + allowed_values=allowed_values, + ) ) - ) - logging.info(skip_message) - pytest.skip(skip_message) + logging.info(skip_message) + items.remove(item) def _validate_marker(marker_name, expected_args, args_count): diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index 8e14fb3aa7..ee7f5a7d39 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -50,7 +50,9 @@ def __del__(self): # Catch all exceptions if we fail to close the clients logging.warning("Exception raised when closing remote ssh client: {0}".format(e)) - def run_remote_command(self, command, log_error=True, additional_files=None, raise_on_error=True, login_shell=True): + def run_remote_command( + self, command, log_error=True, additional_files=None, raise_on_error=True, login_shell=True, hide=False + ): """ Execute remote command on the cluster master node. @@ -59,6 +61,7 @@ def run_remote_command(self, command, log_error=True, additional_files=None, rai :param additional_files: additional files to copy before executing script. :param raise_on_error: if True raises a RemoteCommandExecutionError on failures :param login_shell: if True prepends /bin/bash --login -c to the given command + :param hide: do not print command output to the local stdout :return: result of the execution. 
""" if isinstance(command, list): @@ -68,7 +71,7 @@ def run_remote_command(self, command, log_error=True, additional_files=None, rai if login_shell: command = "/bin/bash --login -c {0}".format(shlex.quote(command)) - result = self.__connection.run(command, warn=True, pty=True, hide=False) + result = self.__connection.run(command, warn=True, pty=True, hide=hide) result.stdout = "\n".join(result.stdout.splitlines()) result.stderr = "\n".join(result.stderr.splitlines()) if result.failed and raise_on_error: @@ -81,7 +84,7 @@ def run_remote_command(self, command, log_error=True, additional_files=None, rai raise RemoteCommandExecutionError(result) return result - def run_remote_script(self, script_file, args=None, log_error=True, additional_files=None): + def run_remote_script(self, script_file, args=None, log_error=True, additional_files=None, hide=False): """ Execute a script remotely on the cluster master node. @@ -90,6 +93,7 @@ def run_remote_script(self, script_file, args=None, log_error=True, additional_f :param args: args to pass to the script when invoked. :param log_error: log errors. :param additional_files: additional files to copy before executing script. + :param hide: do not print command output to the local stdout :return: result of the execution. """ script_name = os.path.basename(script_file) @@ -97,7 +101,10 @@ def run_remote_script(self, script_file, args=None, log_error=True, additional_f if not args: args = [] return self.run_remote_command( - ["/bin/bash", "--login", script_name] + args, log_error=log_error, additional_files=additional_files + ["/bin/bash", "--login", script_name] + args, + log_error=log_error, + additional_files=additional_files, + hide=hide, ) def _copy_additional_files(self, files): diff --git a/tests/integration-tests/reports_generator.py b/tests/integration-tests/reports_generator.py index 89eb6d1bc7..c5f8eb89c3 100644 --- a/tests/integration-tests/reports_generator.py +++ b/tests/integration-tests/reports_generator.py @@ -58,6 +58,8 @@ def generate_json_report(test_results_dir): _record_results(results, root, "./testcase[error]/properties/property", "errors") _record_results(results, root, "./testcase/properties/property", "total") + _record_succeeded_results(results) + with open("{0}/test_report.json".format(test_results_dir), "w") as out_f: out_f.write(json.dumps(results, indent=4)) @@ -65,16 +67,23 @@ def generate_json_report(test_results_dir): def _record_results(results_dict, results_xml_root, xpath_exp, label): - for skipped in results_xml_root.findall(xpath_exp): - if not skipped.get("name") in results_dict: - results_dict[skipped.get("name")] = {} - if not skipped.get("value") in results_dict[skipped.get("name")]: - results_dict[skipped.get("name")].update({skipped.get("value"): _empty_results_dict()}) - results_dict[skipped.get("name")][skipped.get("value")][label] += 1 + for match in results_xml_root.findall(xpath_exp): + if not match.get("name") in results_dict: + results_dict[match.get("name")] = {} + if not match.get("value") in results_dict[match.get("name")]: + results_dict[match.get("name")].update({match.get("value"): _empty_results_dict()}) + results_dict[match.get("name")][match.get("value")][label] += 1 def _empty_results_dict(): return {"total": 0, "skipped": 0, "failures": 0, "errors": 0} -# generate_tabular_report("1549489575.329696.out", None, None, None, None) +def _record_succeeded_results(results): + results["all"]["succeeded"] = ( + results["all"]["total"] - results["all"]["skipped"] - results["all"]["failures"] - 
results["all"]["errors"] + ) + for dimension in results: + if dimension != "all": + for result in results[dimension].values(): + result["succeeded"] = result["total"] - result["skipped"] - result["failures"] - result["errors"] diff --git a/tests/integration-tests/test_runner.py b/tests/integration-tests/test_runner.py index 1023073ac3..bf353dbfc7 100644 --- a/tests/integration-tests/test_runner.py +++ b/tests/integration-tests/test_runner.py @@ -63,6 +63,9 @@ "custom_awsbatch_template_url": None, "custom_awsbatchcli_url": None, "custom_ami": None, + "vpc_stack": None, + "cluster": None, + "no_delete": False, } @@ -152,6 +155,16 @@ def _init_argparser(): parser.add_argument( "--custom-ami", help="custom AMI to use for all tests.", default=TEST_DEFAULTS.get("custom_ami") ) + parser.add_argument("--vpc-stack", help="Name of an existing vpc stack.", default=TEST_DEFAULTS.get("vpc_stack")) + parser.add_argument( + "--cluster", help="Use an existing cluster instead of creating one.", default=TEST_DEFAULTS.get("cluster") + ) + parser.add_argument( + "--no-delete", + action="store_true", + help="Don't delete stacks after tests are complete.", + default=TEST_DEFAULTS.get("no_delete"), + ) return parser @@ -199,6 +212,7 @@ def _get_pytest_args(args, regions, log_file, out_dir): pytest_args.append("--html={0}/{1}/results.html".format(args.output_dir, out_dir)) _set_custom_packages_args(args, pytest_args) + _set_custom_stack_args(args, pytest_args) return pytest_args @@ -223,6 +237,17 @@ def _set_custom_packages_args(args, pytest_args): pytest_args.extend(["--custom-ami", args.custom_ami]) +def _set_custom_stack_args(args, pytest_args): + if args.vpc_stack: + pytest_args.extend(["--vpc-stack", args.vpc_stack]) + + if args.cluster: + pytest_args.extend(["--cluster", args.cluster]) + + if args.no_delete: + pytest_args.append("--no-delete") + + def _get_pytest_regionalized_args(region, args): return _get_pytest_args( args=args, diff --git a/tests/integration-tests/tests/cfn-init/__init__.py b/tests/integration-tests/tests/cfn-init/__init__.py new file mode 100644 index 0000000000..2251b11f46 --- /dev/null +++ b/tests/integration-tests/tests/cfn-init/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init.py b/tests/integration-tests/tests/cfn-init/test_cfn_init.py new file mode 100644 index 0000000000..a96236297d --- /dev/null +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init.py @@ -0,0 +1,62 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. 
+# See the License for the specific language governing permissions and limitations under the License. + +import boto3 +import pytest + +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor +from tests.common.assertions import assert_instance_replaced_or_terminating +from tests.common.compute_logs_common import wait_compute_log +from tests.common.schedulers_common import SlurmCommands + + +@pytest.mark.regions(["eu-central-1"]) +@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.schedulers(["slurm"]) +@pytest.mark.usefixtures("os", "instance", "scheduler") +def test_replace_compute_on_failure(region, pcluster_config_reader, clusters_factory, s3_bucket_factory, test_datadir): + """ + Test that compute nodes get replaced on userdata failures and logs get saved in shared directory. + + The failure is caused by a post_install script that exits with errors on compute nodes. + """ + bucket_name = s3_bucket_factory() + bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name) + bucket.upload_file(str(test_datadir / "post_install.sh"), "post_install.sh") + cluster_config = pcluster_config_reader(bucket_name=bucket_name) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + # submit a job to spin up a compute node that will fail due to post_install script + sge_commands = SlurmCommands(remote_command_executor) + sge_commands.submit_command("sleep 1") + instance_id = wait_compute_log(remote_command_executor) + + # extract logs and check one of them + _assert_compute_logs(remote_command_executor, instance_id) + + # check that instance got already replaced or is marked as Unhealthy + assert_instance_replaced_or_terminating(instance_id, region) + + +def _assert_compute_logs(remote_command_executor, instance_id): + remote_command_executor.run_remote_command( + "tar -xf /home/logs/compute/{0}.tar.gz --directory /tmp".format(instance_id) + ) + remote_command_executor.run_remote_command("test -f /tmp/var/log/cfn-init.log") + output = remote_command_executor.run_remote_command( + 'find /tmp/var/log -type f | xargs grep "Reporting instance as unhealthy and dumping logs to"', + hide=True, + login_shell=False, + ).stdout + assert_that(output).is_not_empty() diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/pcluster.config.ini b/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/pcluster.config.ini new file mode 100644 index 0000000000..27ca7e1b1e --- /dev/null +++ b/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/pcluster.config.ini @@ -0,0 +1,20 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +initial_queue_size = 0 +s3_read_resource = arn:aws:s3:::{{ bucket_name }}/* +post_install = s3://{{ bucket_name }}/post_install.sh + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} diff --git a/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/post_install.sh b/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/post_install.sh new file mode 100755 index 0000000000..7fe90d7d00 --- /dev/null +++ 
b/tests/integration-tests/tests/cfn-init/test_cfn_init/test_replace_compute_on_failure/post_install.sh @@ -0,0 +1,13 @@ +#!/bin/bash +. "/etc/parallelcluster/cfnconfig" + +case "${cfn_node_type}" in + MasterServer) + exit 0 + ;; + ComputeFleet) + exit 1 + ;; + *) + ;; +esac diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py new file mode 100644 index 0000000000..7074da1f2d --- /dev/null +++ b/tests/integration-tests/tests/common/assertions.py @@ -0,0 +1,60 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +import boto3 + +from assertpy import assert_that +from tests.common.scaling_common import get_compute_nodes_allocation +from time_utils import minutes + + +def assert_instance_replaced_or_terminating(instance_id, region): + """Assert that a given instance got replaced or is marked as Unhealthy.""" + response = boto3.client("autoscaling", region_name=region).describe_auto_scaling_instances( + InstanceIds=[instance_id] + ) + assert_that( + not response["AutoScalingInstances"] + or response["AutoScalingInstances"][0]["LifecycleState"] == "Terminating" + or response["AutoScalingInstances"][0]["HealthStatus"] == "UNHEALTHY" + ).is_true() + + +def assert_asg_desired_capacity(region, asg_name, expected): + asg_client = boto3.client("autoscaling", region_name=region) + asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0] + assert_that(asg.get("DesiredCapacity")).is_equal_to(expected) + + +def assert_no_errors_in_logs(remote_command_executor, log_files): + __tracebackhide__ = True + for log_file in log_files: + log = remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout + for error_level in ["CRITICAL", "ERROR"]: + assert_that(log).does_not_contain(error_level) + + +def assert_scaling_worked(scheduler_commands, region, stack_name, scaledown_idletime, expected_max, expected_final): + jobs_execution_time = 1 + estimated_scaleup_time = 5 + max_scaledown_time = 10 + asg_capacity_time_series, compute_nodes_time_series, _ = get_compute_nodes_allocation( + scheduler_commands=scheduler_commands, + region=region, + stack_name=stack_name, + max_monitoring_time=minutes(jobs_execution_time) + + minutes(scaledown_idletime) + + minutes(estimated_scaleup_time) + + minutes(max_scaledown_time), + ) + assert_that(max(asg_capacity_time_series)).is_equal_to(expected_max) + assert_that(max(compute_nodes_time_series)).is_equal_to(expected_max) + assert_that(asg_capacity_time_series[-1]).is_equal_to(expected_final) + assert_that(compute_nodes_time_series[-1]).is_equal_to(expected_final) diff --git a/tests/integration-tests/tests/common/compute_logs_common.py b/tests/integration-tests/tests/common/compute_logs_common.py new file mode 100644 index 0000000000..df6b43abf2 --- /dev/null +++ b/tests/integration-tests/tests/common/compute_logs_common.py @@ -0,0 +1,27 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
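For context, `wait_compute_log` below polls the shared `/home/logs/compute` directory with the `retrying` package until the failing compute node has dumped its logs. The general shape of that idiom, as a self-contained sketch (`wait_for_file` is a made-up placeholder, not part of the patch):

    import os
    from retrying import retry

    @retry(
        retry_on_exception=lambda exc: isinstance(exc, FileNotFoundError),
        wait_fixed=30 * 1000,           # 30 seconds between attempts
        stop_max_delay=10 * 60 * 1000,  # give up after 10 minutes overall
    )
    def wait_for_file(path):
        # Raising re-triggers the call after wait_fixed; returning ends the polling.
        if not os.path.exists(path):
            raise FileNotFoundError(path)
        return path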
+# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance +# with the License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. +from retrying import retry + +from remote_command_executor import RemoteCommandExecutionError +from time_utils import minutes, seconds + + +@retry( + retry_on_exception=lambda exception: isinstance(exception, RemoteCommandExecutionError), + wait_fixed=seconds(30), + stop_max_delay=minutes(10), +) +def wait_compute_log(remote_command_executor): + remote_command_executor.run_remote_command("test -d /home/logs/compute", log_error=False) + # return instance-id + return remote_command_executor.run_remote_command( + "find /home/logs/compute/ -type f -printf '%f\\n' -quit | head -1 | cut -d. -f1", log_error=False + ).stdout diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py index 6f0da610e6..0ad64467d3 100644 --- a/tests/integration-tests/tests/common/scaling_common.py +++ b/tests/integration-tests/tests/common/scaling_common.py @@ -43,7 +43,7 @@ def get_compute_nodes_allocation(scheduler_commands, region, stack_name, max_mon ) def _watch_compute_nodes_allocation(): compute_nodes = scheduler_commands.compute_nodes_count() - asg_capacity = _get_desired_asg_capacity(region, stack_name) + asg_capacity = get_desired_asg_capacity(region, stack_name) timestamp = time.time() # add values only if there is a transition. @@ -71,10 +71,53 @@ def _watch_compute_nodes_allocation(): return asg_capacity_time_series, compute_nodes_time_series, timestamps -def _get_desired_asg_capacity(region, stack_name): - """Retrieve the desired capacity of the autoscaling group for a specific cluster.""" +def watch_compute_nodes(scheduler_commands, max_monitoring_time, number_of_nodes): + """Watch periodically the number of nodes seen by the scheduler.""" + compute_nodes_time_series = [] + timestamps = [] + + @retry( + # Retry until the given number_of_nodes is equal to the number of compute nodes + retry_on_result=lambda _: compute_nodes_time_series[-1] != number_of_nodes, + wait_fixed=seconds(20), + stop_max_delay=max_monitoring_time, + ) + def _watch_compute_nodes_allocation(): + compute_nodes = scheduler_commands.compute_nodes_count() + timestamp = time.time() + + # add values only if there is a transition. + if len(compute_nodes_time_series) == 0 or compute_nodes_time_series[-1] != compute_nodes: + compute_nodes_time_series.append(compute_nodes) + timestamps.append(timestamp) + + try: + _watch_compute_nodes_allocation() + except RetryError: + # ignoring this error in order to perform assertions on the collected data. 
+ pass + + logging.info( + "Monitoring completed: %s, %s", + "compute_nodes_time_series [" + " ".join(map(str, compute_nodes_time_series)) + "]", + "timestamps [" + " ".join(map(str, timestamps)) + "]", + ) + + +def _get_asg(region, stack_name): + """Retrieve the autoscaling group for a specific cluster.""" asg_conn = boto3.client("autoscaling", region_name=region) tags = asg_conn.describe_tags(Filters=[{"Name": "value", "Values": [stack_name]}]) asg_name = tags.get("Tags")[0].get("ResourceId") response = asg_conn.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]) - return response["AutoScalingGroups"][0]["DesiredCapacity"] + return response["AutoScalingGroups"][0] + + +def get_desired_asg_capacity(region, stack_name): + """Retrieve the desired capacity of the autoscaling group for a specific cluster.""" + return _get_asg(region, stack_name)["DesiredCapacity"] + + +def get_max_asg_capacity(region, stack_name): + """Retrieve the max capacity of the autoscaling group for a specific cluster.""" + return _get_asg(region, stack_name)["MaxSize"] diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index 8b5dabb97f..7fe2762c18 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -8,7 +8,7 @@ # or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and # limitations under the License. - +import os import re from abc import ABCMeta, abstractmethod @@ -56,7 +56,7 @@ def get_job_exit_status(self, job_id): pass @abstractmethod - def submit_command(self, command, nodes=1): + def submit_command(self, command, nodes=1, slots=None): """ Submit a job to the scheduler. @@ -65,6 +65,16 @@ def submit_command(self, command, nodes=1): """ pass + @abstractmethod + def submit_script(self, script, nodes=1, slots=None, additional_files=None): + """ + Submit a job to the scheduler by using a script file. + + :param script: script to submit. + :return: result from remote command execution. 
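+        :param nodes: number of nodes to allocate for the job, when supported by the scheduler.
+        :param slots: number of slots/tasks to request, when supported by the scheduler.
+        :param additional_files: additional files to copy to the remote host before submitting the script.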
+ """ + pass + @abstractmethod def assert_job_succeeded(self, job_id, children_number=0): """ @@ -105,9 +115,12 @@ def assert_job_submitted(self, awsbsub_output): # noqa: D102 assert_that(match).is_not_none() return match.group(1) - def submit_command(self, command, nodes=1): # noqa: D102 + def submit_command(self, command, nodes=1, slots=None): # noqa: D102 return self._remote_command_executor.run_remote_command('echo "{0}" | awsbsub -n {1}'.format(command, nodes)) + def submit_script(self, script, nodes=1, additional_files=None, slots=None): # noqa: D102 + raise NotImplementedError + def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 __tracebackhide__ = True status = self.get_job_exit_status(job_id) @@ -117,6 +130,9 @@ def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 def compute_nodes_count(self): # noqa: D102 raise NotImplementedError + def get_compute_nodes(self): # noqa: D102 + raise NotImplementedError + class SgeCommands(SchedulerCommands): """Implement commands for sge scheduler.""" @@ -135,15 +151,39 @@ def get_job_exit_status(self, job_id): # noqa: D102 assert_that(match).is_not_none() return match.group(1) - def assert_job_submitted(self, qsub_output): # noqa: D102 + def assert_job_submitted(self, qsub_output, is_array=False): # noqa: D102 __tracebackhide__ = True - match = re.search(r"Your job ([0-9]+) \(.+\) has been submitted", qsub_output) + if is_array: + regex = r"Your job-array ([0-9]+)\.[0-9\-:]+ \(.+\) has been submitted" + else: + regex = r"Your job ([0-9]+) \(.+\) has been submitted" + match = re.search(regex, qsub_output) assert_that(match).is_not_none() return match.group(1) - def submit_command(self, command, nodes=1): # noqa: D102 - # TODO add support for multiple nodes - return self._remote_command_executor.run_remote_command("echo '{0}' | qsub".format(command)) + def submit_command(self, command, nodes=1, slots=None, hold=False): # noqa: D102 + flags = "" + if nodes != 1: + raise Exception("SGE does not support nodes option") + if slots: + flags += "-pe mpi {0} ".format(slots) + if hold: + flags += "-h " + return self._remote_command_executor.run_remote_command( + "echo '{0}' | qsub {1}".format(command, flags), raise_on_error=False + ) + + def submit_script(self, script, nodes=1, slots=None, additional_files=None): # noqa: D102 + if not additional_files: + additional_files = [] + additional_files.append(script) + flags = "" + if slots: + flags += "-pe mpi {0} ".format(slots) + script_name = os.path.basename(script) + return self._remote_command_executor.run_remote_command( + "qsub {0} {1}".format(flags, script_name), additional_files=additional_files + ) def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 __tracebackhide__ = True @@ -155,6 +195,10 @@ def compute_nodes_count(self): # noqa: D102 # split()[-1] to extract last line and trim whitespaces return int(result.stdout.split()[-1]) + def get_compute_nodes(self): # noqa: D102 + result = self._remote_command_executor.run_remote_command("qhost | grep ip- | awk '{print $1}'") + return result.stdout.splitlines() + class SlurmCommands(SchedulerCommands): """Implement commands for slurm scheduler.""" @@ -179,18 +223,44 @@ def assert_job_submitted(self, sbatch_output): # noqa: D102 assert_that(match).is_not_none() return match.group(1) - def submit_command(self, command, nodes=1): # noqa: D102 - return self._remote_command_executor.run_remote_command("sbatch -N {0} --wrap='{1}'".format(nodes, command)) + def submit_command(self, command, nodes=1, 
slots=None, host=None): # noqa: D102 + submission_command = "sbatch -N {0} --wrap='{1}'".format(nodes, command) + if host: + submission_command += " --nodelist={0}".format(host) + if slots: + submission_command += " -n {0}".format(slots) + return self._remote_command_executor.run_remote_command(submission_command) + + def submit_script(self, script, nodes=1, slots=None, host=None, additional_files=None): # noqa: D102 + if not additional_files: + additional_files = [] + additional_files.append(script) + script_name = os.path.basename(script) + submission_command = "sbatch" + if host: + submission_command += " --nodelist={0}".format(host) + if slots: + submission_command += " -n {0}".format(slots) + if nodes > 1: + submission_command += " -N {0}".format(slots) + submission_command += " {1}".format(nodes, script_name) + return self._remote_command_executor.run_remote_command(submission_command, additional_files=additional_files) def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 result = self._remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)) - return "JobState=COMPLETED" in result.stdout + assert_that(result.stdout).contains("JobState=COMPLETED") def compute_nodes_count(self): # noqa: D102 result = self._remote_command_executor.run_remote_command("sinfo --Node --noheader | grep compute | wc -l") # split()[-1] to extract last line and trim whitespaces return int(result.stdout.split()[-1]) + def get_compute_nodes(self): # noqa: D102 + result = self._remote_command_executor.run_remote_command( + "sinfo --Node --noheader | grep compute | awk '{print $1}'" + ) + return result.stdout.splitlines() + class TorqueCommands(SchedulerCommands): """Implement commands for torque scheduler.""" @@ -210,6 +280,9 @@ def assert_job_submitted(self, qsub_output): # noqa: D102 def submit_command(self, command): # noqa: D102 raise NotImplementedError + def submit_script(self, script, nodes=1): # noqa: D102 + raise NotImplementedError + def assert_job_succeeded(self, job_id, children_number=0): # noqa: D102 raise NotImplementedError @@ -220,6 +293,9 @@ def compute_nodes_count(self): # noqa: D102 # split()[-1] to extract last line and trim whitespaces return int(result.stdout.split()[-1]) + def get_compute_nodes(self): # noqa: D102 + raise NotImplementedError + def get_scheduler_commands(scheduler, remote_command_executor): scheduler_commands = { diff --git a/tests/integration-tests/tests/schedulers/test_awsbatch.py b/tests/integration-tests/tests/schedulers/test_awsbatch.py index 17b0ad6e88..4c278bcf59 100644 --- a/tests/integration-tests/tests/schedulers/test_awsbatch.py +++ b/tests/integration-tests/tests/schedulers/test_awsbatch.py @@ -18,7 +18,7 @@ from tests.common.schedulers_common import AWSBatchCommands -@pytest.mark.regions(["us-east-1", "eu-west-1"]) +@pytest.mark.regions(["eu-west-1"]) @pytest.mark.instances(["c5.xlarge", "t2.large"]) @pytest.mark.dimensions("*", "*", "alinux", "awsbatch") @pytest.mark.usefixtures("region", "os", "instance", "scheduler") diff --git a/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.ini b/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.ini index 360de75fd0..bd9a0d9d3b 100644 --- a/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.ini +++ b/tests/integration-tests/tests/schedulers/test_awsbatch/test_awsbatch/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = 
parallelcluster-vpc scheduler = awsbatch +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} min_vcpus = 2 desired_vcpus = 2 diff --git a/tests/integration-tests/tests/schedulers/test_sge.py b/tests/integration-tests/tests/schedulers/test_sge.py new file mode 100644 index 0000000000..4f37c23783 --- /dev/null +++ b/tests/integration-tests/tests/schedulers/test_sge.py @@ -0,0 +1,124 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging +import re + +import pytest + +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor +from tests.common.assertions import assert_no_errors_in_logs, assert_scaling_worked +from tests.common.schedulers_common import SgeCommands + + +@pytest.mark.regions(["ap-southeast-1"]) +@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.schedulers(["sge"]) +@pytest.mark.usefixtures("os", "instance", "scheduler") +def test_sge(region, pcluster_config_reader, clusters_factory): + """ + Test all AWS SGE related features. + + Grouped all tests in a single function so that cluster can be reused for all of them. + """ + scaledown_idletime = 3 + max_queue_size = 5 + max_slots = 4 + cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + + _test_sge_version(remote_command_executor) + _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime) + _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) + _test_job_arrays_and_parallel_jobs(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) + # TODO: _test_dynamic_max_cluster_size + + assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) + + +def _test_sge_version(remote_command_executor): + logging.info("Testing SGE Version") + version = remote_command_executor.run_remote_command("qstat -help | head -n 1").stdout + assert_that(version).is_equal_to("SGE 8.1.9") + + +def _test_non_runnable_jobs(remote_command_executor, max_queue_size, max_slots, region, cluster, scaledown_idletime): + logging.info("Testing jobs that violate scheduling requirements") + sge_commands = SgeCommands(remote_command_executor) + + # Make sure the cluster has at least 1 node in the queue so that we can verify cluster scales down correctly + if sge_commands.compute_nodes_count() == 0: + result = sge_commands.submit_command("sleep 1") + job_id = sge_commands.assert_job_submitted(result.stdout) + sge_commands.wait_job_completed(job_id) + assert_that(sge_commands.compute_nodes_count()).is_greater_than(0) + + logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available") + result = sge_commands.submit_command("sleep 1000", slots=(max_slots * max_queue_size) + 1) + max_slots_job_id = 
sge_commands.assert_job_submitted(result.stdout) + assert_that(_get_job_state(remote_command_executor, max_slots_job_id)).is_equal_to("qw") + + logging.info("Testing cluster doesn't scale when job is set on hold") + result = sge_commands.submit_command("sleep 1000", hold=True) + hold_job_id = sge_commands.assert_job_submitted(result.stdout) + assert_that(_get_job_state(remote_command_executor, hold_job_id)).is_equal_to("hqw") + + logging.info("Testing cluster scales down when pending jobs cannot be submitted") + assert_scaling_worked(sge_commands, region, cluster.cfn_name, scaledown_idletime, expected_max=1, expected_final=0) + # Assert jobs are still pending + pending_jobs = remote_command_executor.run_remote_command("qstat -s p | tail -n +3 | awk '{ print $1 }'").stdout + pending_jobs = pending_jobs.splitlines() + assert_that(pending_jobs).contains(max_slots_job_id, hold_job_id) + + +def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime): + logging.info("Testing cluster doesn't scale when job dependencies are not satisfied") + sge_commands = SgeCommands(remote_command_executor) + result = sge_commands.submit_command("sleep 60", nodes=1) + job_id = sge_commands.assert_job_submitted(result.stdout) + result = remote_command_executor.run_remote_command( + "echo 'sleep 1' | qsub -hold_jid {0}".format(job_id), raise_on_error=False + ) + dependent_job_id = sge_commands.assert_job_submitted(result.stdout) + + assert_that(_get_job_state(remote_command_executor, dependent_job_id)).is_equal_to("hqw") + + # Assert scaling worked as expected + assert_scaling_worked(sge_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0) + # Assert jobs were completed + sge_commands.assert_job_succeeded(job_id) + sge_commands.assert_job_succeeded(dependent_job_id) + + +def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime): + logging.info("Testing cluster scales correctly with array jobs and parallel jobs") + sge_commands = SgeCommands(remote_command_executor) + + result = remote_command_executor.run_remote_command("echo 'sleep 1' | qsub -t 1-5", raise_on_error=False) + array_job_id = sge_commands.assert_job_submitted(result.stdout, is_array=True) + + result = remote_command_executor.run_remote_command("echo 'sleep 1' | qsub -pe mpi 4", raise_on_error=False) + parallel_job_id = sge_commands.assert_job_submitted(result.stdout) + + # Assert scaling worked as expected + assert_scaling_worked(sge_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0) + # Assert jobs were completed + sge_commands.assert_job_succeeded(array_job_id) + sge_commands.assert_job_succeeded(parallel_job_id) + + +def _get_job_state(remote_command_executor, job_id): + pending_jobs = remote_command_executor.run_remote_command("qstat | tail -n +3 | awk '{ print $1,$5 }'").stdout + match = re.search(r"{0} (\w+)".format(job_id), pending_jobs) + assert_that(match).is_not_none() + return match.group(1) diff --git a/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini b/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini new file mode 100644 index 0000000000..e0f1238bd3 --- /dev/null +++ b/tests/integration-tests/tests/schedulers/test_sge/test_sge/pcluster.config.ini @@ -0,0 +1,26 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = 
parallelcluster-vpc +scheduler = sge +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +initial_queue_size = 0 +max_queue_size = {{ max_queue_size }} +maintain_initial_size = false +scaling_settings = custom + +[scaling custom] +scaledown_idletime = {{ scaledown_idletime }} + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +use_public_ips = false diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index 2b773d2bf5..4c68423c81 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -16,10 +16,9 @@ import pytest from assertpy import assert_that -from remote_command_executor import RemoteCommandExecutor -from tests.common.scaling_common import get_compute_nodes_allocation +from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor +from tests.common.assertions import assert_asg_desired_capacity, assert_no_errors_in_logs, assert_scaling_worked from tests.common.schedulers_common import SlurmCommands -from time_utils import minutes @pytest.mark.regions(["us-west-1"]) @@ -41,9 +40,12 @@ def test_slurm(region, pcluster_config_reader, clusters_factory): _test_slurm_version(remote_command_executor) _test_dynamic_max_cluster_size(remote_command_executor, region, cluster.asg) _test_cluster_limits(remote_command_executor, max_queue_size, region, cluster.asg) - _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) + _test_job_dependencies(remote_command_executor, region, cluster.cfn_name, scaledown_idletime, max_queue_size) + _test_job_arrays_and_parallel_jobs(remote_command_executor, region, cluster.cfn_name, scaledown_idletime) _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size) + assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) + def _test_slurm_version(remote_command_executor): logging.info("Testing Slurm Version") @@ -63,12 +65,14 @@ def _test_dynamic_max_cluster_size(remote_command_executor, region, asg_name): # Change ASG value and check dummy-nodes settings new_max_size = 1 asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=new_max_size) - time.sleep(40) + # sleeping for 200 seconds since daemons fetch this data every 3 minutes + time.sleep(200) _assert_dummy_nodes(remote_command_executor, new_max_size) # Restore initial cluster size asg_client.update_auto_scaling_group(AutoScalingGroupName=asg_name, MaxSize=current_max_size) - time.sleep(40) + # sleeping for 200 seconds since daemons fetch this data every 3 minutes + time.sleep(200) _assert_dummy_nodes(remote_command_executor, current_max_size) @@ -82,7 +86,7 @@ def _test_dynamic_dummy_nodes(remote_command_executor, max_queue_size): _assert_dummy_nodes(remote_command_executor, max_queue_size - 1) -def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime): +def _test_job_dependencies(remote_command_executor, region, stack_name, scaledown_idletime, max_queue_size): logging.info("Testing cluster doesn't scale when job dependencies are not satisfied") slurm_commands = SlurmCommands(remote_command_executor) result = slurm_commands.submit_command("sleep 60", nodes=1) @@ -98,38 +102,50 @@ def _test_job_dependencies(remote_command_executor, region, stack_name, scaledow ) 
assert_that(_get_job_info(remote_command_executor, dependent_job_id)).contains("JobState=PENDING Reason=Dependency") - jobs_execution_time = 1 - estimated_scaleup_time = 5 - estimated_scaledown_time = 20 - asg_capacity_time_series, compute_nodes_time_series, timestamps = get_compute_nodes_allocation( - scheduler_commands=slurm_commands, - region=region, - stack_name=stack_name, - max_monitoring_time=minutes(jobs_execution_time) - + minutes(scaledown_idletime) - + minutes(estimated_scaleup_time) - + minutes(estimated_scaledown_time), - ) - assert_that(max(asg_capacity_time_series)).is_equal_to(1) - assert_that(max(compute_nodes_time_series)).is_equal_to(1) - assert_that(asg_capacity_time_series[-1]).is_equal_to(0) - assert_that(compute_nodes_time_series[-1]).is_equal_to(0) + assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=1, expected_final=0) + # Assert scheduler configuration is correct + _assert_dummy_nodes(remote_command_executor, max_queue_size) + assert_that(_retrieve_slurm_nodes_from_config(remote_command_executor)).is_empty() + # Assert jobs were completed + _assert_job_completed(remote_command_executor, job_id) + _assert_job_completed(remote_command_executor, dependent_job_id) def _test_cluster_limits(remote_command_executor, max_queue_size, region, asg_name): logging.info("Testing cluster doesn't scale when job requires a capacity that is higher than the max available") slurm_commands = SlurmCommands(remote_command_executor) - result = slurm_commands.submit_command("sleep 1", nodes=max_queue_size + 1) - job_id = slurm_commands.assert_job_submitted(result.stdout) - # Wait for reason to be computed - time.sleep(3) - assert_that(_get_job_info(remote_command_executor, job_id)).contains("JobState=PENDING Reason=PartitionNodeLimit") + result = slurm_commands.submit_command("sleep 1000", nodes=max_queue_size + 1) + max_nodes_job_id = slurm_commands.assert_job_submitted(result.stdout) + result = remote_command_executor.run_remote_command("sbatch -N 1 --wrap='sleep 1' --cpus-per-task 5") + max_cpu_job_id = slurm_commands.assert_job_submitted(result.stdout) # Check we are not scaling time.sleep(60) - asg_client = boto3.client("autoscaling", region_name=region) - asg = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name]).get("AutoScalingGroups")[0] - assert_that(asg.get("DesiredCapacity")).is_equal_to(0) + assert_asg_desired_capacity(region, asg_name, expected=0) + assert_that(_get_job_info(remote_command_executor, max_nodes_job_id)).contains( + "JobState=PENDING Reason=PartitionNodeLimit" + ) + assert_that(_get_job_info(remote_command_executor, max_cpu_job_id)).contains( + "JobState=PENDING Reason=Nodes_required_for_job_are_DOWN,_DRAINED" + "_or_reserved_for_jobs_in_higher_priority_partitions" + ) + + +def _test_job_arrays_and_parallel_jobs(remote_command_executor, region, stack_name, scaledown_idletime): + logging.info("Testing cluster scales correctly with array jobs and parallel jobs") + slurm_commands = SlurmCommands(remote_command_executor) + + result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -a 1-5") + array_job_id = slurm_commands.assert_job_submitted(result.stdout) + + result = remote_command_executor.run_remote_command("sbatch --wrap 'sleep 1' -c 3 -n 2") + parallel_job_id = slurm_commands.assert_job_submitted(result.stdout) + + # Assert scaling worked as expected + assert_scaling_worked(slurm_commands, region, stack_name, scaledown_idletime, expected_max=3, expected_final=0) + # Assert jobs 
were completed + _assert_job_completed(remote_command_executor, array_job_id) + _assert_job_completed(remote_command_executor, parallel_job_id) def _retrieve_slurm_dummy_nodes_from_config(remote_command_executor): @@ -137,6 +153,11 @@ def _retrieve_slurm_dummy_nodes_from_config(remote_command_executor): return remote_command_executor.run_remote_command(retrieve_dummy_nodes_command).stdout +def _retrieve_slurm_nodes_from_config(remote_command_executor): + retrieve_dummy_nodes_command = "sudo tail -n +2 /opt/slurm/etc/slurm_parallelcluster_nodes.conf" + return remote_command_executor.run_remote_command(retrieve_dummy_nodes_command).stdout + + def _retrieve_slurm_dummy_nodes(remote_command_executor): retrieve_dummy_nodes_command = "scontrol -F show nodes | grep 'State=FUTURE'" return len(remote_command_executor.run_remote_command(retrieve_dummy_nodes_command).stdout.split("\n")) @@ -145,10 +166,23 @@ def _retrieve_slurm_dummy_nodes(remote_command_executor): def _assert_dummy_nodes(remote_command_executor, count): __tracebackhide__ = True dummy_nodes_config = _retrieve_slurm_dummy_nodes_from_config(remote_command_executor) - assert_that(dummy_nodes_config).is_equal_to("NodeName=dummy-compute[1-{0}] CPUs=2048 State=FUTURE".format(count)) + # For the moment the test is enabled only on c5.xlarge, hence hardcoding slots for simplicity + slots = 4 + assert_that(dummy_nodes_config).is_equal_to( + "NodeName=dummy-compute[1-{0}] CPUs={1} State=FUTURE".format(count, slots) + ) dummy_nodes_count = _retrieve_slurm_dummy_nodes(remote_command_executor) assert_that(dummy_nodes_count).is_equal_to(count) def _get_job_info(remote_command_executor, job_id): return remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id)).stdout + + +def _assert_job_completed(remote_command_executor, job_id): + try: + result = remote_command_executor.run_remote_command("scontrol show jobs -o {0}".format(job_id), log_error=False) + return "JobState=COMPLETED" in result.stdout + except RemoteCommandExecutionError as e: + # Handle the case when job is deleted from history + assert_that(e.result.stdout).contains("slurm_load_jobs error: Invalid job id specified") diff --git a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.config.ini b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.config.ini index fddc2bc7f5..50a7c4cf2f 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.config.ini +++ b/tests/integration-tests/tests/schedulers/test_slurm/test_slurm/pcluster.config.ini @@ -22,3 +22,5 @@ scaledown_idletime = {{ scaledown_idletime }} [vpc parallelcluster-vpc] vpc_id = {{ vpc_id }} master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} +use_public_ips = false diff --git a/tests/integration-tests/tests/storage/test_ebs.py b/tests/integration-tests/tests/storage/test_ebs.py index 07d685ff82..a71252cd0d 100644 --- a/tests/integration-tests/tests/storage/test_ebs.py +++ b/tests/integration-tests/tests/storage/test_ebs.py @@ -19,7 +19,7 @@ from tests.storage.storage_common import verify_directory_correctly_shared -@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]) +@pytest.mark.regions(["us-west-2", "cn-north-1", "us-gov-west-1"]) @pytest.mark.instances(["c4.xlarge", "c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") @@ -35,7 +35,8 @@ def test_ebs_single(scheduler, pcluster_config_reader, clusters_factory): 
_test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) -@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]) +# cn-north-1 does not support KMS +@pytest.mark.regions(["us-east-1", "us-gov-east-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") @@ -52,8 +53,8 @@ def test_ebs_multiple(scheduler, pcluster_config_reader, clusters_factory): _test_ebs_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) -@pytest.mark.regions(["us-east-1", "cn-north-1", "us-gov-west-1"]) -@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.regions(["eu-west-2", "cn-northwest-1", "us-gov-west-1"]) +@pytest.mark.instances(["c4.xlarge", "c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") def test_default_ebs(scheduler, pcluster_config_reader, clusters_factory): diff --git a/tests/integration-tests/tests/storage/test_ebs/test_default_ebs/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_default_ebs/pcluster.config.ini index 5732d70e6c..95dce375f3 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_default_ebs/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_default_ebs/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini index 6aea380572..ac36864f10 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_multiple/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini b/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini index 811370854f..995998e018 100644 --- a/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_ebs/test_ebs_single/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 @@ -28,4 +29,3 @@ compute_subnet_id = {{ private_subnet_id }} shared_dir = {{ mount_dir }} volume_type = io1 volume_iops = 210 -encrypted = true diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre.py b/tests/integration-tests/tests/storage/test_fsx_lustre.py index f9f3ce2805..d40bbddf08 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre.py +++ b/tests/integration-tests/tests/storage/test_fsx_lustre.py @@ -19,7 +19,7 @@ from tests.common.schedulers_common import SgeCommands -@pytest.mark.regions(["us-east-1", "eu-west-1"]) +@pytest.mark.regions(["us-east-1"]) @pytest.mark.instances(["c5.xlarge"]) 
@pytest.mark.oss(["centos7", "alinux"]) @pytest.mark.schedulers(["sge"]) diff --git a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini index bef59dc140..e12910c073 100644 --- a/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_fsx_lustre/test_fsx_lustre/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/storage/test_raid.py b/tests/integration-tests/tests/storage/test_raid.py index f159be2306..7d9d4d7205 100644 --- a/tests/integration-tests/tests/storage/test_raid.py +++ b/tests/integration-tests/tests/storage/test_raid.py @@ -19,7 +19,7 @@ from tests.storage.storage_common import verify_directory_correctly_shared -@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]) +@pytest.mark.regions(["ap-south-1", "cn-northwest-1", "us-gov-east-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") @@ -35,7 +35,7 @@ def test_raid_performance_mode(scheduler, pcluster_config_reader, clusters_facto _test_raid_correctly_shared(remote_command_executor, mount_dir, scheduler_commands) -@pytest.mark.regions(["us-east-1", "eu-west-1", "cn-north-1", "us-gov-west-1"]) +@pytest.mark.regions(["us-east-2", "cn-north-1", "us-gov-west-1"]) @pytest.mark.instances(["c5.xlarge"]) @pytest.mark.schedulers(["sge", "awsbatch"]) @pytest.mark.usefixtures("region", "os", "instance") diff --git a/tests/integration-tests/tests/storage/test_raid/test_raid_fault_tolerance_mode/pcluster.config.ini b/tests/integration-tests/tests/storage/test_raid/test_raid_fault_tolerance_mode/pcluster.config.ini index 6f79aa858f..fd70cf4e0e 100644 --- a/tests/integration-tests/tests/storage/test_raid/test_raid_fault_tolerance_mode/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_raid/test_raid_fault_tolerance_mode/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/storage/test_raid/test_raid_performance_mode/pcluster.config.ini b/tests/integration-tests/tests/storage/test_raid/test_raid_performance_mode/pcluster.config.ini index 3bba593301..f429ab5a35 100644 --- a/tests/integration-tests/tests/storage/test_raid/test_raid_performance_mode/pcluster.config.ini +++ b/tests/integration-tests/tests/storage/test_raid/test_raid_performance_mode/pcluster.config.ini @@ -9,6 +9,7 @@ base_os = {{ os }} key_name = {{ key_name }} vpc_settings = parallelcluster-vpc scheduler = {{ scheduler }} +master_instance_type = {{ instance }} compute_instance_type = {{ instance }} {% if scheduler == "awsbatch" %} min_vcpus = 4 diff --git a/tests/integration-tests/tests/test_efa/test_efa.py b/tests/integration-tests/tests/test_efa/test_efa.py new file mode 100644 index 0000000000..6fa0e071cb --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa.py @@ -0,0 +1,97 @@ +# Copyright 2019 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import logging +import re + +import pytest + +from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor +from tests.common.schedulers_common import get_scheduler_commands + +INSTANCES_TO_SLOTS_MAP = {"c5n.18xlarge": 72, "p3dn.24xlarge": 96, "i3en.24xlarge": 96} + + +@pytest.mark.regions(["us-east-1"]) +@pytest.mark.instances(["c5n.18xlarge", "p3dn.24xlarge", "i3en.24xlarge"]) +@pytest.mark.oss(["alinux", "centos7", "ubuntu1604"]) +@pytest.mark.schedulers(["sge", "slurm"]) +@pytest.mark.usefixtures("os", "region") +def test_efa(scheduler, instance, pcluster_config_reader, clusters_factory, test_datadir): + """ + Test all EFA Features. + + Grouped all tests in a single function so that cluster can be reused for all of them. + """ + max_queue_size = 2 + slots_per_instance = INSTANCES_TO_SLOTS_MAP[instance] + cluster_config = pcluster_config_reader(max_queue_size=max_queue_size) + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + + _test_efa_installed(scheduler_commands, remote_command_executor) + _test_efa_mpi(remote_command_executor, scheduler_commands, test_datadir, slots_per_instance) + _test_osu_benchmarks(remote_command_executor, scheduler_commands, test_datadir, slots_per_instance) + + +def _test_efa_installed(scheduler_commands, remote_command_executor): + # Output contains: + # 00:06.0 Ethernet controller: Amazon.com, Inc. Device efa0 + logging.info("Testing EFA installed") + result = scheduler_commands.submit_command("lspci > /shared/lspci.out") + + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id) + scheduler_commands.assert_job_succeeded(job_id) + + # Check EFA interface is present on compute node + result = remote_command_executor.run_remote_command("cat /shared/lspci.out") + assert_that(result.stdout).contains("00:06.0 Ethernet controller: Amazon.com, Inc. Device efa0") + + # Check EFA interface not present on master + result = remote_command_executor.run_remote_command("lspci") + assert_that(result.stdout).does_not_contain("00:06.0 Ethernet controller: Amazon.com, Inc. 
Device efa0") + + +def _test_efa_mpi(remote_command_executor, scheduler_commands, test_datadir, slots_per_instance): + logging.info("Testing mpi job with EFA") + # Compile mpi script + remote_command_executor.run_remote_command( + "/opt/amazon/efa/bin/mpicc -o mpi_hello_world mpi_hello_world.c", + additional_files=[str(test_datadir / "mpi_hello_world.c")], + ) + + # submit script using additional files + result = scheduler_commands.submit_script(str(test_datadir / "mpi_submit.sh"), slots=2 * slots_per_instance) + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id) + scheduler_commands.assert_job_succeeded(job_id) + + mpi_out = remote_command_executor.run_remote_command("cat /shared/mpi.out").stdout + assert_that(mpi_out.splitlines()).is_length(2) + assert_that(mpi_out).matches(r"Hello world from processor ip-.+, rank 0 out of 2 processors") + assert_that(mpi_out).matches(r"Hello world from processor ip-.+, rank 1 out of 2 processors") + + +def _test_osu_benchmarks(remote_command_executor, scheduler_commands, test_datadir, slots_per_instance): + logging.info("Running OSU benchmarks") + remote_command_executor.run_remote_script(str(test_datadir / "init_osu_benchmarks.sh"), hide=True) + + result = scheduler_commands.submit_script(str(test_datadir / "osu_submit.sh"), slots=2 * slots_per_instance) + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id) + scheduler_commands.assert_job_succeeded(job_id) + + output = remote_command_executor.run_remote_command("cat /shared/osu.out").stdout + latency = re.search(r"0\s+(\d\d)\.", output).group(1) + assert_that(int(latency)).is_less_than(20) diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh new file mode 100644 index 0000000000..cc912c9dc8 --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/init_osu_benchmarks.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +set -e + +cd /shared +wget http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-5.4.tar.gz +tar zxvf ./osu-micro-benchmarks-5.4.tar.gz +cd osu-micro-benchmarks-5.4/ +./configure CC=/opt/amazon/efa/bin/mpicc CXX=/opt/amazon/efa/bin/mpicxx +make \ No newline at end of file diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c new file mode 100644 index 0000000000..0f0b0252a2 --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_hello_world.c @@ -0,0 +1,35 @@ +// Copyright 2011 www.mpitutorial.com +// +// An intro MPI hello world program that uses MPI_Init, MPI_Comm_size, +// MPI_Comm_rank, MPI_Finalize, and MPI_Get_processor_name. +// +#include +#include +#include + +int main(int argc, char** argv) { + // Initialize the MPI environment. The two arguments to MPI Init are not + // currently used by MPI implementations, but are there in case future + // implementations might need the arguments. 
+ MPI_Init(NULL, NULL); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + // Get the name of the processor + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + + // Print off a hello world message + printf("Hello world from processor %s, rank %d out of %d processors\n", + processor_name, world_rank, world_size); + + // Finalize the MPI environment. No more MPI calls can be made after this + MPI_Finalize(); +} diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh new file mode 100644 index 0000000000..1f48445e1e --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/mpi_submit.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +module load openmpi +mpirun -N 1 -np 2 "mpi_hello_world" >> /shared/mpi.out diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh new file mode 100644 index 0000000000..39394ea744 --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/osu_submit.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set -e + +module load openmpi +mpirun --map-by ppr:1:node /shared/osu-micro-benchmarks-5.4/mpi/pt2pt/osu_latency >> /shared/osu.out diff --git a/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini b/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini new file mode 100644 index 0000000000..2722fee64d --- /dev/null +++ b/tests/integration-tests/tests/test_efa/test_efa/test_efa/pcluster.config.ini @@ -0,0 +1,23 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = c5.xlarge +compute_instance_type = {{ instance }} +initial_queue_size = 2 +maintain_initial_size = true +max_queue_size = {{ max_queue_size }} +enable_efa = compute +placement_group = DYNAMIC + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} +compute_subnet_id = {{ private_subnet_id }} diff --git a/tests/integration-tests/tests/test_scaling.py b/tests/integration-tests/tests/test_scaling.py index 24e4d2d218..3761ebcb3b 100644 --- a/tests/integration-tests/tests/test_scaling.py +++ b/tests/integration-tests/tests/test_scaling.py @@ -12,15 +12,19 @@ import logging import pytest +from retrying import retry from assertpy import assert_that from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor -from tests.common.scaling_common import get_compute_nodes_allocation +from tests.common.assertions import assert_instance_replaced_or_terminating, assert_no_errors_in_logs +from tests.common.compute_logs_common import wait_compute_log +from tests.common.scaling_common import get_compute_nodes_allocation, get_desired_asg_capacity from tests.common.schedulers_common import get_scheduler_commands -from time_utils import minutes +from time_utils import minutes, seconds @pytest.mark.skip_schedulers(["awsbatch"]) +@pytest.mark.skip_instances(["c5n.18xlarge", "p3dn.24xlarge", "i3en.24xlarge"]) @pytest.mark.usefixtures("region", "os", "instance") def test_multiple_jobs_submission(scheduler, region, 
pcluster_config_reader, clusters_factory, test_datadir): scaledown_idletime = 4 @@ -55,6 +59,50 @@ def test_multiple_jobs_submission(scheduler, region, pcluster_config_reader, clu expected_compute_nodes=(0, 3), ) + logging.info("Verifying no error in logs") + assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) + + +@pytest.mark.regions(["sa-east-1"]) +@pytest.mark.instances(["c5.xlarge"]) +@pytest.mark.schedulers(["slurm", "sge"]) +@pytest.mark.usefixtures("region", "os", "instance") +@pytest.mark.nodewatcher +def test_nodewatcher_terminates_failing_node(scheduler, region, pcluster_config_reader, clusters_factory, test_datadir): + cluster_config = pcluster_config_reader() + cluster = clusters_factory(cluster_config) + remote_command_executor = RemoteCommandExecutor(cluster) + scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) + + compute_nodes = scheduler_commands.get_compute_nodes() + + # submit a job that kills the slurm daemon so that the node enters a failing state + scheduler_commands.submit_script(str(test_datadir / "{0}_kill_scheduler_job.sh".format(scheduler))) + instance_id = wait_compute_log(remote_command_executor) + + _assert_compute_logs(remote_command_executor, instance_id) + assert_instance_replaced_or_terminating(instance_id, region) + # verify that desired capacity is still 1 + assert_that(get_desired_asg_capacity(region, cluster.cfn_name)).is_equal_to(1) + _assert_nodes_removed_from_scheduler(scheduler_commands, compute_nodes) + + assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"]) + + +@retry(wait_fixed=seconds(20), stop_max_delay=minutes(5)) +def _assert_nodes_removed_from_scheduler(scheduler_commands, nodes): + assert_that(scheduler_commands.get_compute_nodes()).does_not_contain(*nodes) + + +def _assert_compute_logs(remote_command_executor, instance_id): + remote_command_executor.run_remote_command( + "tar -xf /home/logs/compute/{0}.tar.gz --directory /tmp".format(instance_id) + ) + remote_command_executor.run_remote_command("test -f /tmp/var/log/nodewatcher") + messages_log = remote_command_executor.run_remote_command("cat /tmp/var/log/nodewatcher", hide=True).stdout + assert_that(messages_log).contains("Node is marked as down by scheduler or not attached correctly. 
Terminating...") + assert_that(messages_log).contains("Dumping logs to /home/logs/compute/{0}.tar.gz".format(instance_id)) + def _assert_scaling_works( asg_capacity_time_series, compute_nodes_time_series, expected_asg_capacity, expected_compute_nodes diff --git a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/pcluster.config.ini b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/pcluster.config.ini new file mode 100644 index 0000000000..f30fa38db5 --- /dev/null +++ b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/pcluster.config.ini @@ -0,0 +1,19 @@ +[global] +cluster_template = default + +[aws] +aws_region_name = {{ region }} + +[cluster default] +base_os = {{ os }} +key_name = {{ key_name }} +vpc_settings = parallelcluster-vpc +scheduler = {{ scheduler }} +master_instance_type = {{ instance }} +compute_instance_type = {{ instance }} +initial_queue_size = 1 +maintain_initial_size = true + +[vpc parallelcluster-vpc] +vpc_id = {{ vpc_id }} +master_subnet_id = {{ public_subnet_id }} diff --git a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh new file mode 100755 index 0000000000..1a65342d6c --- /dev/null +++ b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/sge_kill_scheduler_job.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. +sudo /etc/init.d/sgeexecd.p6444 stop +# keep job up and running +sleep infinity diff --git a/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh new file mode 100755 index 0000000000..da42d09253 --- /dev/null +++ b/tests/integration-tests/tests/test_scaling/test_nodewatcher_terminates_failing_node/slurm_kill_scheduler_job.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
diff --git a/tests/integration-tests/tests/update/__init__.py b/tests/integration-tests/tests/update/__init__.py
new file mode 100644
index 0000000000..2251b11f46
--- /dev/null
+++ b/tests/integration-tests/tests/update/__init__.py
@@ -0,0 +1,11 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py
new file mode 100644
index 0000000000..f44b648c7e
--- /dev/null
+++ b/tests/integration-tests/tests/update/test_update.py
@@ -0,0 +1,209 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+import time
+from collections import namedtuple
+
+import boto3
+import pytest
+
+from assertpy import assert_that
+from remote_command_executor import RemoteCommandExecutor
+from tests.common.scaling_common import get_max_asg_capacity, watch_compute_nodes
+from tests.common.schedulers_common import SlurmCommands
+from time_utils import minutes
+
+PClusterConfig = namedtuple(
+    "PClusterConfig",
+    [
+        "max_queue_size",
+        "compute_instance_type",
+        "compute_root_volume_size",
+        "s3_read_resource",
+        "s3_read_write_resource",
+    ],
+)
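Note: since ``PClusterConfig`` is a namedtuple, the updated configuration used later in the test could equivalently be derived from the initial one with ``_replace``, overriding only the fields that change. A self-contained example with a reduced field set (this is an editorial aside, not part of the change):

    from collections import namedtuple

    PClusterConfig = namedtuple("PClusterConfig", ["max_queue_size", "compute_instance_type", "compute_root_volume_size"])
    init_config = PClusterConfig(max_queue_size=5, compute_instance_type="c5.xlarge", compute_root_volume_size=30)
    # Only the overridden fields change; the rest is copied from init_config.
    updated_config = init_config._replace(max_queue_size=10, compute_instance_type="c4.xlarge")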
+
+
+@pytest.mark.dimensions("eu-west-1", "c5.xlarge", "alinux", "slurm")
+@pytest.mark.usefixtures("os", "scheduler")
+def test_update(instance, region, pcluster_config_reader, clusters_factory, test_datadir):
+    """
+    Test 'pcluster update' command.
+
+    Grouped all tests in a single function so that the cluster can be reused for all of them.
+    """
+    s3_arn = "arn:aws:s3:::fake_bucket/*"
+    init_config = PClusterConfig(
+        max_queue_size=5,
+        compute_instance_type=instance,
+        compute_root_volume_size=30,
+        s3_read_resource=s3_arn,
+        s3_read_write_resource=s3_arn,
+    )
+    cluster = _init_cluster(clusters_factory, pcluster_config_reader, init_config)
+    command_executor = RemoteCommandExecutor(cluster)
+    slurm_commands = SlurmCommands(command_executor)
+
+    _verify_initialization(command_executor, slurm_commands, region, test_datadir, cluster, init_config)
+
+    s3_arn_updated = "arn:aws:s3:::fake_bucket/fake_folder/*"
+    updated_config = PClusterConfig(
+        max_queue_size=10,
+        compute_instance_type="c4.xlarge",
+        compute_root_volume_size=40,
+        s3_read_resource=s3_arn_updated,
+        s3_read_write_resource=s3_arn_updated,
+    )
+    _update_cluster(cluster, updated_config)
+
+    # verify updated parameters
+    _test_max_queue(region, cluster.cfn_name, updated_config.max_queue_size)
+    _test_s3_read_resource(region, cluster, updated_config.s3_read_resource)
+    _test_s3_read_write_resource(region, cluster, updated_config.s3_read_write_resource)
+
+    # verify params that are NOT updated in OLD compute nodes
+    compute_nodes = slurm_commands.get_compute_nodes()
+    _test_compute_instance_type(region, cluster.cfn_name, init_config.compute_instance_type, compute_nodes[0])
+    _test_compute_root_volume_size(
+        command_executor, slurm_commands, test_datadir, init_config.compute_root_volume_size, compute_nodes[0]
+    )
+    # add compute nodes and verify updated params in NEW compute nodes
+    new_compute_nodes = _add_compute_nodes(slurm_commands)
+    _test_compute_instance_type(region, cluster.cfn_name, updated_config.compute_instance_type, new_compute_nodes[0])
+    _test_compute_root_volume_size(
+        command_executor, slurm_commands, test_datadir, updated_config.compute_root_volume_size, new_compute_nodes[0]
+    )
+
+
+def _init_cluster(clusters_factory, pcluster_config_reader, config):
+    # read configuration and create cluster
+    cluster_config = pcluster_config_reader(
+        max_queue_size=config.max_queue_size,
+        compute_instance_type=config.compute_instance_type,
+        compute_root_volume_size=config.compute_root_volume_size,
+        s3_read_resource=config.s3_read_resource,
+        s3_read_write_resource=config.s3_read_write_resource,
+    )
+    cluster = clusters_factory(cluster_config)
+    return cluster
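Note: ``pcluster_config_reader`` is a fixture defined elsewhere in the integration-test framework; the keyword arguments passed in ``_init_cluster`` fill the ``{{ ... }}`` placeholders of the ``pcluster.config.ini`` template added later in this diff. A minimal sketch of the rendering it is assumed to perform, using jinja2 (the real fixture also injects region, key name, VPC ids and more):

    # Hypothetical sketch only.
    from pathlib import Path
    import jinja2

    def render_cluster_config(template_path, output_path, **settings):
        # Replace {{ placeholder }} markers in the config template with the given settings.
        template = jinja2.Template(Path(template_path).read_text())
        Path(output_path).write_text(template.render(**settings))
        return output_path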
+
+
+def _verify_initialization(command_executor, slurm_commands, region, test_datadir, cluster, config):
+    # Verify initial settings
+    _test_max_queue(region, cluster.cfn_name, config.max_queue_size)
+    _test_s3_read_resource(region, cluster, config.s3_read_resource)
+    _test_s3_read_write_resource(region, cluster, config.s3_read_write_resource)
+
+    # Verify Compute nodes initial settings
+    compute_nodes = slurm_commands.get_compute_nodes()
+    _test_compute_instance_type(region, cluster.cfn_name, config.compute_instance_type, compute_nodes[0])
+    _test_compute_root_volume_size(
+        command_executor, slurm_commands, test_datadir, config.compute_root_volume_size, compute_nodes[0]
+    )
+
+
+def _update_cluster(cluster, config):
+    # change cluster.config settings
+    _update_cluster_property(cluster, "max_queue_size", str(config.max_queue_size))
+    _update_cluster_property(cluster, "compute_instance_type", config.compute_instance_type)
+    _update_cluster_property(cluster, "compute_root_volume_size", str(config.compute_root_volume_size))
+    _update_cluster_property(cluster, "s3_read_resource", config.s3_read_resource)
+    _update_cluster_property(cluster, "s3_read_write_resource", config.s3_read_write_resource)
+    # rewrite configuration file starting from the updated cluster.config object
+    with open(cluster.config_file, "w") as configfile:
+        cluster.config.write(configfile)
+    # update cluster
+    cluster.update()
+
+
+def _update_cluster_property(cluster, property_name, property_value):
+    cluster.config.set("cluster default", property_name, property_value)
+
+
+def _test_max_queue(region, stack_name, queue_size):
+    asg_max_size = get_max_asg_capacity(region, stack_name)
+    assert_that(asg_max_size).is_equal_to(queue_size)
+
+
+def _add_compute_nodes(slurm_commands, number_of_nodes=1):
+    """
+    Add new compute nodes to the cluster.
+
+    It is required because some changes are applied only to compute nodes launched after the update.
+    :param slurm_commands: the scheduler commands wrapper used to submit jobs and list compute nodes
+    :param number_of_nodes: number of nodes to add
+    :return: a list containing the new compute nodes only
+    """
+    initial_compute_nodes = slurm_commands.get_compute_nodes()
+
+    number_of_nodes = len(initial_compute_nodes) + number_of_nodes
+    # submit a job to perform a scaling up action and have new instances
+    result = slurm_commands.submit_command("sleep 1", nodes=number_of_nodes)
+    slurm_commands.assert_job_submitted(result.stdout)
+
+    estimated_scaleup_time = 5
+    watch_compute_nodes(
+        scheduler_commands=slurm_commands,
+        max_monitoring_time=minutes(estimated_scaleup_time),
+        number_of_nodes=number_of_nodes,
+    )
+
+    return [node for node in slurm_commands.get_compute_nodes() if node not in initial_compute_nodes]
+
+
+def _test_compute_instance_type(region, stack_name, compute_instance_type, host):
+    hostname = "{0}.{1}.compute.internal".format(host, region)
+    ec2_resource = boto3.resource("ec2", region_name=region)
+    instance_types = []
+    for instance in ec2_resource.instances.filter(
+        Filters=[
+            {"Name": "tag:Application", "Values": [stack_name]},
+            {"Name": "private-dns-name", "Values": [hostname]},
+        ]
+    ):
+        instance_types.append(instance.instance_type)
+
+    assert_that(instance_types).contains(compute_instance_type)
+
+
+def _test_compute_root_volume_size(command_executor, slurm_commands, test_datadir, compute_root_volume_size, host):
+    # submit a job to retrieve the compute root volume size and save it in a file
+    result = slurm_commands.submit_script(str(test_datadir / "slurm_get_root_volume_size.sh"), host=host)
+    job_id = slurm_commands.assert_job_submitted(result.stdout)
+    slurm_commands.wait_job_completed(job_id)
+    slurm_commands.assert_job_succeeded(job_id)
+
+    # read volume size from file
+    time.sleep(5)  # wait a bit to be sure the file is there
+    result = command_executor.run_remote_command("cat /shared/{0}_root_volume_size.txt".format(host))
+    assert_that(result.stdout).matches(r"{size}G".format(size=compute_root_volume_size))
+
+
+def _test_policy_statement(region, cluster, policy_name, policy_statement):
+    iam_client = boto3.client("iam", region_name=region)
+    root_role = cluster.cfn_resources.get("RootRole")
+
+    statement = (
+        iam_client.get_role_policy(RoleName=root_role, PolicyName=policy_name)
+        .get("PolicyDocument")
+        .get("Statement")[0]
+        .get("Resource")[0]
+    )
+    assert_that(statement).is_equal_to(policy_statement)
+
+
+def _test_s3_read_resource(region, cluster, s3_arn):
+    _test_policy_statement(region, cluster, "S3Read", s3_arn)
+
+
+def _test_s3_read_write_resource(region, cluster, s3_arn):
+    _test_policy_statement(region, cluster, "S3ReadWrite", s3_arn)
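Note: ``_test_policy_statement`` navigates the boto3 ``get_role_policy`` response; the shape it relies on is assumed to look like the following (role name, action and resource values are illustrative only):

    get_role_policy_response = {
        "RoleName": "parallelcluster-RootRole",  # illustrative
        "PolicyName": "S3Read",
        "PolicyDocument": {
            "Version": "2012-10-17",
            "Statement": [{"Effect": "Allow", "Action": ["s3:Get*"], "Resource": ["arn:aws:s3:::fake_bucket/*"]}],
        },
    }
    # Same navigation as in _test_policy_statement:
    resource = get_role_policy_response["PolicyDocument"]["Statement"][0]["Resource"][0]
    assert resource == "arn:aws:s3:::fake_bucket/*"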
diff --git a/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini
new file mode 100644
index 0000000000..dd4b2d9b49
--- /dev/null
+++ b/tests/integration-tests/tests/update/test_update/test_update/pcluster.config.ini
@@ -0,0 +1,27 @@
+[global]
+cluster_template = default
+
+[aws]
+aws_region_name = {{ region }}
+
+[cluster default]
+base_os = {{ os }}
+key_name = {{ key_name }}
+vpc_settings = parallelcluster-vpc
+scheduler = {{ scheduler }}
+master_instance_type = {{ instance }}
+compute_instance_type = {{ compute_instance_type }}
+compute_root_volume_size = {{ compute_root_volume_size }}
+initial_queue_size = 1
+max_queue_size = {{ max_queue_size }}
+maintain_initial_size = true
+scaling_settings = custom
+s3_read_resource = {{ s3_read_resource }}
+s3_read_write_resource = {{ s3_read_write_resource }}
+
+[scaling custom]
+scaledown_idletime = 3
+
+[vpc parallelcluster-vpc]
+vpc_id = {{ vpc_id }}
+master_subnet_id = {{ public_subnet_id }}
diff --git a/tests/integration-tests/tests/update/test_update/test_update/slurm_get_root_volume_size.sh b/tests/integration-tests/tests/update/test_update/test_update/slurm_get_root_volume_size.sh
new file mode 100644
index 0000000000..225dca8feb
--- /dev/null
+++ b/tests/integration-tests/tests/update/test_update/test_update/slurm_get_root_volume_size.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+df -h | grep '/$' | awk '{{print $2}}' > /shared/$(hostname)_root_volume_size.txt
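Note: the job script above writes the human-readable size of the root filesystem (e.g. ``40G``) to ``/shared/<hostname>_root_volume_size.txt``, which ``_test_compute_root_volume_size`` then matches against the configured volume size. An illustrative Python equivalent of that measurement, for readers less familiar with the ``df`` pipeline (the test itself uses the shell script, not this code):

    # Illustrative only.
    import shutil
    import socket

    def dump_root_volume_size(output_dir="/shared"):
        # Approximate the "df -h" size column for the root filesystem, in GiB.
        size_gib = shutil.disk_usage("/").total / 1024 ** 3
        with open("{0}/{1}_root_volume_size.txt".format(output_dir, socket.gethostname()), "w") as f:
            f.write("{0:.0f}G".format(size_gib))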
diff --git a/util/generate-ami-list.py b/util/generate-ami-list.py
index 51db5da200..150e738644 100644
--- a/util/generate-ami-list.py
+++ b/util/generate-ami-list.py
@@ -37,7 +37,21 @@
 )
 
 
-def get_ami_list(regions, date, cookbook_git_ref, node_git_ref, version, owner):
+def get_ami_list_from_file(regions, cfn_template_file):
+    amis_json = {}
+
+    with open(cfn_template_file) as cfn_file:
+        # object_pairs_hook=OrderedDict preserves the input order
+        cfn_data = json.load(cfn_file, object_pairs_hook=OrderedDict)
+
+    current_amis = cfn_data.get("Mappings").get("AWSRegionOS2AMI")
+
+    for region_name in regions:
+        amis_json[region_name] = OrderedDict(sorted(current_amis.get(region_name).items()))
+    return amis_json
+
+
+def get_ami_list_from_ec2(regions, date, cookbook_git_ref, node_git_ref, version, owner):
     amis_json = {}
 
     for region_name in regions:
@@ -48,6 +62,7 @@ def get_ami_list(regions, date, cookbook_git_ref, node_git_ref, version, owner):
         elif cookbook_git_ref and node_git_ref:
             filters.append({"Name": "tag:parallelcluster_cookbook_ref", "Values": ["%s" % cookbook_git_ref]})
             filters.append({"Name": "tag:parallelcluster_node_ref", "Values": ["%s" % node_git_ref]})
+            filters.append({"Name": "name", "Values": ["aws-parallelcluster-*"]})
         else:
             print("Error: you can search for version and date or cookbook and node git reference")
             exit(1)
@@ -83,7 +98,20 @@ def convert_json_to_txt(amis_json):
     return amis_txt
 
 
-def get_all_aws_regions(region):
+def get_aws_regions_from_file(region_file):
+    # Region file format
+    # {
+    #     "regions": [
+    #         "cn-north-1",
+    #         "cn-northwest-1"
+    #     ]
+    # }
+    with open(region_file) as r_file:
+        region_data = json.load(r_file)
+    return sorted(r for r in region_data.get("regions"))
+
+
+def get_all_aws_regions_from_ec2(region):
     ec2 = boto3.client("ec2", region_name=region)
     return sorted(r.get("RegionName") for r in ec2.describe_regions().get("Regions"))
 
@@ -117,18 +145,23 @@ def update_amis_txt(amis_txt_file, amis):
 if __name__ == "__main__":
     # parse inputs
     parser = argparse.ArgumentParser(description="Get AWS ParallelCluster instances and generate a json and txt file")
-    group1 = parser.add_argument_group("Search by version and date")
+    group1 = parser.add_argument_group("Retrieve instances from EC2 searching by version and date")
     group1.add_argument("--version", type=str, help="release version", required=False)
     group1.add_argument("--date", type=str, help="release date [timestamp] (e.g. 201801112350)", required=False)
-    group2 = parser.add_argument_group("Search by cookbook and node git reference")
+    group2 = parser.add_argument_group("Retrieve instances from EC2 searching by cookbook and node git reference")
    group2.add_argument("--cookbook-git-ref", type=str, help="cookbook git hash reference", required=False)
     group2.add_argument("--node-git-ref", type=str, help="node git hash reference", required=False)
+    group3 = parser.add_argument_group("Retrieve instances from local cfn template for given regions")
+    group3.add_argument("--json-template", type=str, help="path to input json cloudformation template", required=False)
+    group3.add_argument(
+        "--json-regions", type=str, help="path to input json file containing the regions", required=False
+    )
     parser.add_argument("--txt-file", type=str, help="txt output file path", required=False, default="amis.txt")
     parser.add_argument("--partition", type=str, help="commercial | china | govcloud", required=True)
     parser.add_argument(
         "--cloudformation-template",
         type=str,
-        help="path to cloudfomation template",
+        help="path to output cloudformation template",
         required=False,
         default="cloudformation/aws-parallelcluster.cfn.json",
     )
@@ -147,17 +180,19 @@ def update_amis_txt(amis_txt_file, amis):
         print("Unsupported partition %s" % args.partition)
         sys.exit(1)
 
-    regions = get_all_aws_regions(region)
-
-    amis_dict = get_ami_list(
-        regions=regions,
-        date=args.date,
-        cookbook_git_ref=args.cookbook_git_ref,
-        node_git_ref=args.node_git_ref,
-        version=args.version,
-        owner=account_id,
-    )
+    if (args.version and args.date) or (args.cookbook_git_ref and args.node_git_ref):
+        regions = get_all_aws_regions_from_ec2(region)
+        amis_dict = get_ami_list_from_ec2(
+            regions=regions,
+            date=args.date,
+            cookbook_git_ref=args.cookbook_git_ref,
+            node_git_ref=args.node_git_ref,
+            version=args.version,
+            owner=account_id,
+        )
+    else:
+        regions = get_aws_regions_from_file(args.json_regions)
+        amis_dict = get_ami_list_from_file(regions, args.json_template)
 
     cfn_amis = update_cfn_template(cfn_template_file=args.cloudformation_template, amis_to_update=amis_dict)
-    update_amis_txt(amis_txt_file=args.txt_file, amis=cfn_amis)
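Note: ``get_ami_list_from_file`` reads the ``AWSRegionOS2AMI`` mapping of an existing CloudFormation template; the structure it assumes looks like the following (region keys map to an OS-to-AMI dictionary, AMI ids are placeholders):

    cfn_data = {
        "Mappings": {
            "AWSRegionOS2AMI": {
                "cn-north-1": {"alinux": "ami-00000000", "ubuntu1604": "ami-11111111"},
                "cn-northwest-1": {"alinux": "ami-22222222"},
            }
        }
    }
    # Same lookup as in get_ami_list_from_file:
    current_amis = cfn_data["Mappings"]["AWSRegionOS2AMI"]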
diff --git a/util/batch-instance-whitelist.py b/util/instance-whitelist.py
similarity index 85%
rename from util/batch-instance-whitelist.py
rename to util/instance-whitelist.py
index e4aec9713e..76f183244c 100755
--- a/util/batch-instance-whitelist.py
+++ b/util/instance-whitelist.py
@@ -47,7 +47,7 @@
     return set(sorted(r.get("RegionName") for r in ec2.describe_regions().get("Regions"))) - UNSUPPORTED_REGIONS
 
 
-def get_instance_whitelist(args, region):
+def get_batch_instance_whitelist(args, region):
     # try to create a dummy compute environmment
     batch_client = boto3.client("batch", region_name=region)
 
@@ -78,12 +78,11 @@ def get_instance_whitelist(args, region):
     return instances
 
 
-def upload_to_s3(args, region, instances):
+def upload_to_s3(args, region, instances, key):
     s3_client = boto3.resource("s3", region_name=region)
 
     bucket = args.bucket if args.bucket else "%s-aws-parallelcluster" % region
-    key = "instances/batch_instances.json"
 
     if args.dryrun == "true":
         print(instances)
@@ -106,13 +105,18 @@ def upload_to_s3(args, region, instances):
 def main(args):
     # For all regions
     for region in args.regions:
-        instances = get_instance_whitelist(args, region)
-        response = upload_to_s3(args, region, instances)
+        batch_instances = get_batch_instance_whitelist(args, region)
+        if args.efa:
+            efa_instances = args.efa.split(",")
+            instances = {"Features": {"efa": {"instances": efa_instances}, "awsbatch": {"instances": batch_instances}}}
+            upload_to_s3(args, region, instances, "features/feature_whitelist.json")
+        else:
+            upload_to_s3(args, region, batch_instances, "instances/batch_instances.json")
 
 
 if __name__ == "__main__":
     # parse inputs
-    parser = argparse.ArgumentParser(description="Generate a whitelist of batch instance types.")
+    parser = argparse.ArgumentParser(description="Generate a whitelist of instance types per region.")
     parser.add_argument("--partition", type=str, help="commercial | china | govcloud", required=True)
     parser.add_argument(
         "--regions",
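Note: when ``--efa`` is passed, ``main`` uploads a document to ``features/feature_whitelist.json`` with the structure built above; for example (the instance lists are illustrative, only the shape is taken from the change):

    feature_whitelist = {
        "Features": {
            "efa": {"instances": ["c5n.18xlarge", "p3dn.24xlarge"]},
            "awsbatch": {"instances": ["optimal"]},
        }
    }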
@@ -123,6 +127,7 @@ def main(args):
     parser.add_argument(
         "--bucket", type=str, help="Bucket to upload too, defaults to [region]-aws-parallelcluster", required=False
     )
+    parser.add_argument("--efa", type=str, help="Comma separated list of instances supported by EFA", required=False)
     parser.add_argument("--dryrun", type=str, help="Doesn't push anything to S3, just outputs", required=True)
 
     args = parser.parse_args()
diff --git a/util/uploadTemplate.sh b/util/uploadTemplate.sh
old mode 100644
new mode 100755
index 234a204dbc..d5a6279c38
--- a/util/uploadTemplate.sh
+++ b/util/uploadTemplate.sh
@@ -95,7 +95,7 @@ main() {
     echo ""
     echo "Done. Add the following variables to the pcluster config file, under the [cluster ...] section"
     echo "template_url = https://s3${_bucket_region}.amazonaws.com/${_bucket}/template/aws-parallelcluster.cfn.${_version}.json"
-    echo "custom_awsbatch_template_url = https://s3${_bucket_region}.amazonaws.com/${_bucket}/template/batch.cfn.json"
+    echo "custom_awsbatch_template_url = https://s3${_bucket_region}.amazonaws.com/${_bucket}/template/batch-substack.cfn.json"
 }
 
 main "$@"