Intel HPC Platform Spec Integration Tests
Signed-off-by: Sean Smith <seaam@amazon.com>
sean-smith committed Aug 29, 2019
1 parent 5fb0adf commit 0b12996
Showing 11 changed files with 278 additions and 6 deletions.
2 changes: 1 addition & 1 deletion tests/integration-tests/remote_command_executor.py
@@ -92,7 +92,7 @@ def run_remote_script(self, script_file, args=None, log_error=True, additional_f
        :param script_file: local path to the script to execute remotely.
        :param args: args to pass to the script when invoked.
        :param log_error: log errors.
-        :param additional_files: additional files to copy before executing script.
+        :param additional_files: list of additional files (full path) to copy before executing script.
        :param hide: do not print command output to the local stdout
        :return: result of the execution.
        """
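Per the updated docstring, additional_files takes full paths to files copied to the node before the script runs; a minimal usage sketch (the executor instance, script path, args, and file name are illustrative):

    executor = RemoteCommandExecutor(cluster)
    executor.run_remote_script(
        "scripts/run_check.sh",                     # local script, copied to the node and executed
        args=["--verbose"],
        additional_files=["/tmp/config/clck.xml"],  # full paths of extra files copied before the run
    )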
2 changes: 1 addition & 1 deletion tests/integration-tests/tests/common/utils.py
@@ -16,7 +16,7 @@


@retry(stop_max_attempt_number=3, wait_fixed=5000)
-def _fetch_instance_slots(region, instance_type):
+def fetch_instance_slots(region, instance_type):
    bucket_name = "{0}-aws-parallelcluster".format(region)
    try:
        s3 = boto3.resource("s3", region_name=region)
4 changes: 2 additions & 2 deletions tests/integration-tests/tests/efa/test_efa.py
@@ -19,7 +19,7 @@
from tests.common.assertions import assert_no_errors_in_logs
from tests.common.mpi_common import _test_mpi
from tests.common.schedulers_common import get_scheduler_commands
-from tests.common.utils import _fetch_instance_slots
+from tests.common.utils import fetch_instance_slots


@pytest.mark.regions(["us-east-1"])
@@ -33,7 +33,7 @@ def test_efa(region, scheduler, instance, os, pcluster_config_reader, clusters_f
    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    max_queue_size = 2
-    slots_per_instance = _fetch_instance_slots(region, instance)
+    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader(max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
10 changes: 10 additions & 0 deletions tests/integration-tests/tests/intel_hpc/__init__.py
@@ -0,0 +1,10 @@
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
83 changes: 83 additions & 0 deletions tests/integration-tests/tests/intel_hpc/test_intel_hpc.py
@@ -0,0 +1,83 @@
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging

import pytest

from assertpy import assert_that
from remote_command_executor import RemoteCommandExecutor
from tests.common.assertions import assert_no_errors_in_logs
from tests.common.schedulers_common import get_scheduler_commands
from tests.common.utils import fetch_instance_slots


@pytest.mark.regions(["us-east-1"])
@pytest.mark.instances(["c5n.18xlarge"])
@pytest.mark.oss(["centos7"])
@pytest.mark.schedulers(["sge"])
def test_intel_hpc(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir):
"""
Test Intel Cluster Checker
"""
slots_per_instance = fetch_instance_slots(region, instance)
cluster_config = pcluster_config_reader()
cluster = clusters_factory(cluster_config)
remote_command_executor = RemoteCommandExecutor(cluster)
scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
_test_intel_clck(remote_command_executor, scheduler_commands, slots_per_instance, test_datadir)

assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])


def _test_intel_clck(remote_command_executor, scheduler_commands, slots_per_instance, test_datadir):
    # Install Intel Cluster Checker CLCK Master
    logging.info("Installing Intel Cluster Checker")
    remote_command_executor.run_remote_script(str(test_datadir / "install_clck.sh"), hide=False)

    # Install Intel Cluster Checker CLCK Compute
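    # slots=2 * slots_per_instance (2 * 72 vCPUs on c5n.18xlarge) makes the job span both compute
    # nodes, so the compute-side installer below runs on every node in the cluster.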
    result = scheduler_commands.submit_script(
        str(test_datadir / "install_clck_compute.sh"), slots=2 * slots_per_instance
    )
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    scheduler_commands.wait_job_completed(job_id)
    scheduler_commands.assert_job_succeeded(job_id)

    # Create nodefile
    # ip-172-31-15-31 # role: head
    # ip-172-31-12-237 # role: compute
    # ip-172-31-8-49 # role: compute
    remote_command_executor.run_remote_command("echo $HOSTNAME | awk '{print $1 \" # role: head\" }' > nodefile")
    remote_command_executor.run_remote_command(
        "qhost | tail -n +4 | awk '{print $1 \" # role: compute\" }' >> nodefile"
    )
    result = remote_command_executor.run_remote_command("cat nodefile | wc -l")
    assert_that(result.stdout).contains("3")

    # Setup network interface
    # <!-- This tag can be used to set the network interface used for
    #      accumulating data collected on-demand.
    # -->
    # <!--
    # <network_interface>ens5</network_interface>
    # -->
    # /opt/intel/clck/2019.3.5/etc/clck.xml
    remote_command_executor.run_remote_command(
        "sudo cp ~/clck.xml /opt/intel/clck/2019.3.5/etc/clck.xml", additional_files=[str(test_datadir / "clck.xml")]
    )

    # Run Cluster Checker
    result = remote_command_executor.run_remote_script(str(test_datadir / "run_clck.sh"))
    try:
        assert_that(result.stdout).contains("Overall Result: PASS")
    except AssertionError:
        logging.error(remote_command_executor.run_remote_command("cat clck_results.log"))
        raise
@@ -0,0 +1,133 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

<plugins>

<!-- Framework Definition configuration file -->
<framework_definitions>
</framework_definitions>

</plugins>

<analyzer>

<!-- Knowledge Base Configuration -->
<config>
</config>

<!-- DISPLAY OPTIONS -->

<!-- LOG LEVEL -->

<!-- This tag can be used to override the default log level. Valid
values, in increasing order of severity are debug, info,
notice, warning, error, critical, and alert. Only messages
that correspond to specified level and above are output. The
default log level is error.
-->
<!--
<log_level>error</log_level>
-->

<!-- SUPPRESSIONS -->

<!-- This tag can be used to suppress signs and diagnoses. See the User's
Guide for details.
-->
<!--
<suppressions>
</suppressions>
-->

</analyzer>

<postprocessor>
<!-- This tag can be used to override postprocessor extensions in the fwd -->
<!--
<postproc_extensions>
<group>
<entry>summary</entry>
<entry>clck_output_log</entry>
</group>
</postproc_extensions>
-->
</postprocessor>

<collector>
<!-- This tag can be used to run collector extensions such as the
mpi extension.
-->
<!--
<extension>mpi.so</extension>
-->

<!-- This tag can be used to set the network interface used for
accumulating data collected on-demand.
-->
<network_interface>ens5</network_interface>
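<!-- On the centos7 c5n.18xlarge nodes this test targets, the primary ENA interface is typically
     named ens5, which is why it is pinned explicitly here. -->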

<!-- This tag can be used to override the default location for
data provider helper files.
The string %PROVIDER_AUXILIARY_PATH% is replaced with the
value of this tag in the data provider XML configuration
files.
-->
<!--
<provider_auxiliary_path>/opt/intel/clck/2019.3.5/provider/share</provider_auxiliary_path>
-->

<!-- This can be used to override the default location for data
providers.
-->
<!--
<provider_config_dir>/opt/intel/clck/2019.3.5/provider/etc</provider_config_dir>
-->

<!-- This tag can be used to collect missing or old data. The default
is set to off. Valid values are 'on' or 'off'. -->
<!-- <re-collect>off</re-collect> -->


<!-- This tag can be used to override the global default minimum
data provider timeout threshold. Individual data providers
may set larger timeout values, but this global value
overrides any smaller value.
This parameter is the base value of time (in seconds)
multiplied by a scale factor. If this time is exceeded, the
data provider will be terminated to prevent it from hanging.
The scale attribute specifies the rate at which the timeout
value should increase based on the number of nodes. Valid
options are: constant, linear, squared, logarithmic.
The "constant" attribute value does not scale the timeout
with the number of nodes used.
The "linear" attribute value scales linearly with the number
of nodes (e.g. base * num_nodes).
The "squared" attribute value scales with the number of nodes
squared (e.g. base * num_nodes^2).
The "logarithmic" tag scales logarithmically with the number
of nodes (e.g. base * ln((e-1) + num_nodes)).
-->
<!--
<timeout scale="constant">60</timeout>
-->
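<!-- Worked example (illustrative, not part of the stock file): with
     <timeout scale="linear">60</timeout> and a 3-node run, the effective limit is 60 * 3 = 180
     seconds; scale="squared" gives 60 * 3^2 = 540 seconds, and scale="constant" stays at 60
     seconds regardless of node count. -->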
</collector>

<datastore_extensions>
<!-- This tag sets the database implementation. sqlite3 & odbc are the only
supported backends at this time.
-->
<!--
<group path="datastore/intel64/">
<entry config_file="default_sqlite.xml">libsqlite.so</entry>
</group>
-->
</datastore_extensions>

</configuration>
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -e

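# Import Intel's signing key, register the CLCK yum repositories, and install a pinned Cluster Checker build.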
sudo rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
sudo yum-config-manager --add-repo https://yum.repos.intel.com/clck/2019/setup/intel-clck-2019.repo
sudo yum-config-manager --add-repo https://yum.repos.intel.com/clck-ext/2019/setup/intel-clck-ext-2019.repo
sudo yum -y install intel-clck-2019.3.5-025
@@ -0,0 +1,6 @@
#!/bin/bash
set -e

module load openmpi
chmod +x ${HOME}/install_clck.sh
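# "--map-by ppr:1:node" launches exactly one copy of the installer per allocated node, so every
# compute node in the job installs CLCK once.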
mpirun --map-by ppr:1:node ${HOME}/install_clck.sh
@@ -0,0 +1,27 @@
[global]
cluster_template = default

[aws]
aws_region_name = {{ region }}

[cluster default]
base_os = {{ os }}
key_name = {{ key_name }}
vpc_settings = parallelcluster-vpc
scheduler = {{ scheduler }}
master_instance_type = {{ instance }}
compute_instance_type = {{ instance }}
initial_queue_size = 2
maintain_initial_size = true
master_root_volume_size = 80
compute_root_volume_size = 80
ebs_settings = large

[ebs large]
shared_dir = /shared
volume_size = 200

[vpc parallelcluster-vpc]
vpc_id = {{ vpc_id }}
master_subnet_id = {{ public_subnet_id }}
compute_subnet_id = {{ private_subnet_id }}
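The {{ ... }} placeholders above are Jinja expressions filled in by the test's pcluster_config_reader fixture before the cluster is created; a minimal sketch of an equivalent rendering step (the template file name and the concrete values are illustrative):

    from pathlib import Path

    from jinja2 import Template

    # Illustrative only: render the cluster config template with test parameters.
    rendered = Template(Path("pcluster.config.ini").read_text()).render(
        region="us-east-1",
        os="centos7",
        scheduler="sge",
        instance="c5n.18xlarge",
        key_name="my-key",
        vpc_id="vpc-0123456789abcdef0",
        public_subnet_id="subnet-aaaa1111",
        private_subnet_id="subnet-bbbb2222",
    )
    print(rendered)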
@@ -0,0 +1,6 @@
#!/bin/bash
set -e

source /opt/intel/clck/2019.3.5/bin/clckvars.sh
module load intelpsxe intelpython/2 intelpython/3
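# -f points clck at the nodefile generated by the test; -F selects the framework definition used to
# check compliance with the Intel HPC Platform Specification (compat-hpc-2018.0).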
clck -f nodefile -F intel_hpc_platform_compat-hpc-2018.0
4 changes: 2 additions & 2 deletions tests/integration-tests/tests/scaling/test_mpi.py
@@ -17,7 +17,7 @@
from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor
from tests.common.mpi_common import OS_TO_OPENMPI_MODULE_MAP, _test_mpi
from tests.common.schedulers_common import get_scheduler_commands
-from tests.common.utils import _fetch_instance_slots
+from tests.common.utils import fetch_instance_slots
from wrapt_timeout_decorator import timeout


@@ -28,7 +28,7 @@
def test_mpi(scheduler, region, os, instance, pcluster_config_reader, clusters_factory):
    scaledown_idletime = 3
    max_queue_size = 3
-    slots_per_instance = _fetch_instance_slots(region, instance)
+    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
