Intel HPC Platform Spec Integration Tests
Signed-off-by: Sean Smith <seaam@amazon.com>
sean-smith committed Aug 29, 2019
1 parent 5fb0adf commit 0b12996
Showing 11 changed files with 278 additions and 6 deletions.
2 changes: 1 addition & 1 deletion tests/integration-tests/remote_command_executor.py
@@ -92,7 +92,7 @@ def run_remote_script(self, script_file, args=None, log_error=True, additional_f
        :param script_file: local path to the script to execute remotely.
        :param args: args to pass to the script when invoked.
        :param log_error: log errors.
-        :param additional_files: additional files to copy before executing script.
+        :param additional_files: list of additional files (full path) to copy before executing script.
        :param hide: do not print command output to the local stdout
        :return: result of the execution.
        """
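Per the updated docstring, additional_files takes full paths to files copied to the node before the script runs; a minimal usage sketch (the executor instance, script path, args, and file name are illustrative):

    executor = RemoteCommandExecutor(cluster)
    executor.run_remote_script(
        "scripts/run_check.sh",                     # local script, copied to the node and executed
        args=["--verbose"],
        additional_files=["/tmp/config/clck.xml"],  # full paths of extra files copied before the run
    )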
2 changes: 1 addition & 1 deletion tests/integration-tests/tests/common/utils.py
@@ -16,7 +16,7 @@


@retry(stop_max_attempt_number=3, wait_fixed=5000)
-def _fetch_instance_slots(region, instance_type):
+def fetch_instance_slots(region, instance_type):
    bucket_name = "{0}-aws-parallelcluster".format(region)
    try:
        s3 = boto3.resource("s3", region_name=region)
4 changes: 2 additions & 2 deletions tests/integration-tests/tests/efa/test_efa.py
@@ -19,7 +19,7 @@
from tests.common.assertions import assert_no_errors_in_logs
from tests.common.mpi_common import _test_mpi
from tests.common.schedulers_common import get_scheduler_commands
-from tests.common.utils import _fetch_instance_slots
+from tests.common.utils import fetch_instance_slots


@pytest.mark.regions(["us-east-1"])
@@ -33,7 +33,7 @@ def test_efa(region, scheduler, instance, os, pcluster_config_reader, clusters_f
    Grouped all tests in a single function so that cluster can be reused for all of them.
    """
    max_queue_size = 2
-    slots_per_instance = _fetch_instance_slots(region, instance)
+    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader(max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
10 changes: 10 additions & 0 deletions tests/integration-tests/tests/intel_hpc/__init__.py
@@ -0,0 +1,10 @@
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.
83 changes: 83 additions & 0 deletions tests/integration-tests/tests/intel_hpc/test_intel_hpc.py
@@ -0,0 +1,83 @@
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging

import pytest

from assertpy import assert_that
from remote_command_executor import RemoteCommandExecutor
from tests.common.assertions import assert_no_errors_in_logs
from tests.common.schedulers_common import get_scheduler_commands
from tests.common.utils import fetch_instance_slots


@pytest.mark.regions(["us-east-1"])
@pytest.mark.instances(["c5n.18xlarge"])
@pytest.mark.oss(["centos7"])
@pytest.mark.schedulers(["sge"])
def test_intel_hpc(region, scheduler, instance, os, pcluster_config_reader, clusters_factory, test_datadir):
"""
Test Intel Cluster Checker
"""
slots_per_instance = fetch_instance_slots(region, instance)
cluster_config = pcluster_config_reader()
cluster = clusters_factory(cluster_config)
remote_command_executor = RemoteCommandExecutor(cluster)
scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor)
_test_intel_clck(remote_command_executor, scheduler_commands, slots_per_instance, test_datadir)

assert_no_errors_in_logs(remote_command_executor, ["/var/log/sqswatcher", "/var/log/jobwatcher"])


def _test_intel_clck(remote_command_executor, scheduler_commands, slots_per_instance, test_datadir):
    # Install Intel Cluster Checker CLCK Master
    logging.info("Installing Intel Cluster Checker")
    remote_command_executor.run_remote_script(str(test_datadir / "install_clck.sh"), hide=False)

    # Install Intel Cluster Checker CLCK Compute
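    # slots=2 * slots_per_instance (2 * 72 vCPUs on c5n.18xlarge) makes the job span both compute
    # nodes, so the compute-side installer below runs on every node in the cluster.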
    result = scheduler_commands.submit_script(
        str(test_datadir / "install_clck_compute.sh"), slots=2 * slots_per_instance
    )
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    scheduler_commands.wait_job_completed(job_id)
    scheduler_commands.assert_job_succeeded(job_id)

    # Create nodefile
    # ip-172-31-15-31 # role: head
    # ip-172-31-12-237 # role: compute
    # ip-172-31-8-49 # role: compute
    remote_command_executor.run_remote_command("echo $HOSTNAME | awk '{print $1 \" # role: head\" }' > nodefile")
    remote_command_executor.run_remote_command(
        "qhost | tail -n +4 | awk '{print $1 \" # role: compute\" }' >> nodefile"
    )
    result = remote_command_executor.run_remote_command("cat nodefile | wc -l")
    assert_that(result.stdout).contains("3")

    # Setup network interface
    # <!-- This tag can be used to set the network interface used for
    #      accumulating data collected on-demand.
    # -->
    # <!--
    # <network_interface>ens5</network_interface>
    # -->
    # /opt/intel/clck/2019.3.5/etc/clck.xml
    remote_command_executor.run_remote_command(
        "sudo cp ~/clck.xml /opt/intel/clck/2019.3.5/etc/clck.xml", additional_files=[str(test_datadir / "clck.xml")]
    )

    # Run Cluster Checker
    result = remote_command_executor.run_remote_script(str(test_datadir / "run_clck.sh"))
    try:
        assert_that(result.stdout).contains("Overall Result: PASS")
    except AssertionError:
        logging.error(remote_command_executor.run_remote_command("cat clck_results.log"))
        raise
@@ -0,0 +1,133 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration>

<plugins>

<!-- Framework Definition configuration file -->
<framework_definitions>
</framework_definitions>

</plugins>

<analyzer>

<!-- Knowledge Base Configuration -->
<config>
</config>

<!-- DISPLAY OPTIONS -->

<!-- LOG LEVEL -->

<!-- This tag can be used to override the default log level. Valid
values, in increasing order of severity are debug, info,
notice, warning, error, critical, and alert. Only messages
that correspond to specified level and above are output. The
default log level is error.
-->
<!--
<log_level>error</log_level>
-->

<!-- SUPPRESSIONS -->

<!-- This tag can be used to suppress signs and diagnoses. See the User's
Guide for details.
-->
<!--
<suppressions>
</suppressions>
-->

</analyzer>

<postprocessor>
<!-- This tag can be used to override postprocessor extensions in the fwd -->
<!--
<postproc_extensions>
<group>
<entry>summary</entry>
<entry>clck_output_log</entry>
</group>
</postproc_extensions>
-->
</postprocessor>

<collector>
<!-- This tag can be used to run collector extensions such as the
mpi extension.
-->
<!--
<extension>mpi.so</extension>
-->

<!-- This tag can be used to set the network interface used for
accumulating data collected on-demand.
-->
<network_interface>ens5</network_interface>
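<!-- On the centos7 c5n.18xlarge nodes this test targets, the primary ENA interface is typically
     named ens5, which is why it is pinned explicitly here. -->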

<!-- This tag can be used to override the default location for
data provider helper files.
The string %PROVIDER_AUXILIARY_PATH% is replaced with the
value of this tag in the data provider XML configuration
files.
-->
<!--
<provider_auxiliary_path>/opt/intel/clck/2019.3.5/provider/share</provider_auxiliary_path>
-->

<!-- This can be used to override the default location for data
providers.
-->
<!--
<provider_config_dir>/opt/intel/clck/2019.3.5/provider/etc</provider_config_dir>
-->

<!-- This tag can be used to collect missing or old data. The default
is set to off. Valid values are 'on' or 'off'. -->
<!-- <re-collect>off</re-collect> -->


<!-- This tag can be used to override the global default minimum
data provider timeout threshold. Individual data providers
may set larger timeout values, but this global value
overrides any smaller value.
This parameter is the base value of time (in seconds)
multiplied by a scale factor. If this time is exceeded, the
data provider will be terminated to prevent it from hanging.
The scale attribute specifies the rate at which the timeout
value should increase based on the number of nodes. Valid
options are: constant, linear, squared, logarithmic.
The "constant" attribute value does not scale the timeout
with the number of nodes used.
The "linear" attribute value scales linearly with the number
of nodes (e.g. base * num_nodes).
The "squared" attribute value scales with the number of nodes
squared (e.g. base * num_nodes^2).
The "logarithmic" tag scales logarithmically with the number
of nodes (e.g. base * ln((e-1) + num_nodes)).
-->
<!--
<timeout scale="constant">60</timeout>
-->
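<!-- Worked example (illustrative, not part of the stock file): with
     <timeout scale="linear">60</timeout> and a 3-node run, the effective limit is 60 * 3 = 180
     seconds; scale="squared" gives 60 * 3^2 = 540 seconds, and scale="constant" stays at 60
     seconds regardless of node count. -->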
</collector>

<datastore_extensions>
<!-- This tag sets the database implementation. sqlite3 & odbc are the only
supported backends at this time.
-->
<!--
<group path="datastore/intel64/">
<entry config_file="default_sqlite.xml">libsqlite.so</entry>
</group>
-->
</datastore_extensions>

</configuration>
@@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -e

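# Import Intel's signing key, register the CLCK yum repositories, and install a pinned Cluster Checker build.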
sudo rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
sudo yum-config-manager --add-repo https://yum.repos.intel.com/clck/2019/setup/intel-clck-2019.repo
sudo yum-config-manager --add-repo https://yum.repos.intel.com/clck-ext/2019/setup/intel-clck-ext-2019.repo
sudo yum -y install intel-clck-2019.3.5-025
@@ -0,0 +1,6 @@
#!/bin/bash
set -e

module load openmpi
chmod +x ${HOME}/install_clck.sh
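# "--map-by ppr:1:node" launches exactly one copy of the installer per allocated node, so every
# compute node in the job installs CLCK once.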
mpirun --map-by ppr:1:node ${HOME}/install_clck.sh
@@ -0,0 +1,27 @@
[global]
cluster_template = default

[aws]
aws_region_name = {{ region }}

[cluster default]
base_os = {{ os }}
key_name = {{ key_name }}
vpc_settings = parallelcluster-vpc
scheduler = {{ scheduler }}
master_instance_type = {{ instance }}
compute_instance_type = {{ instance }}
initial_queue_size = 2
maintain_initial_size = true
master_root_volume_size = 80
compute_root_volume_size = 80
ebs_settings = large

[ebs large]
shared_dir = /shared
volume_size = 200

[vpc parallelcluster-vpc]
vpc_id = {{ vpc_id }}
master_subnet_id = {{ public_subnet_id }}
compute_subnet_id = {{ private_subnet_id }}
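The {{ ... }} placeholders above are Jinja expressions filled in by the test's pcluster_config_reader fixture before the cluster is created; a minimal sketch of an equivalent rendering step (the template file name and the concrete values are illustrative):

    from pathlib import Path

    from jinja2 import Template

    # Illustrative only: render the cluster config template with test parameters.
    rendered = Template(Path("pcluster.config.ini").read_text()).render(
        region="us-east-1",
        os="centos7",
        scheduler="sge",
        instance="c5n.18xlarge",
        key_name="my-key",
        vpc_id="vpc-0123456789abcdef0",
        public_subnet_id="subnet-aaaa1111",
        private_subnet_id="subnet-bbbb2222",
    )
    print(rendered)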
@@ -0,0 +1,6 @@
#!/bin/bash
set -e

source /opt/intel/clck/2019.3.5/bin/clckvars.sh
module load intelpsxe intelpython/2 intelpython/3
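# -f points clck at the nodefile generated by the test; -F selects the framework definition used to
# check compliance with the Intel HPC Platform Specification (compat-hpc-2018.0).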
clck -f nodefile -F intel_hpc_platform_compat-hpc-2018.0
4 changes: 2 additions & 2 deletions tests/integration-tests/tests/scaling/test_mpi.py
@@ -17,7 +17,7 @@
from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor
from tests.common.mpi_common import OS_TO_OPENMPI_MODULE_MAP, _test_mpi
from tests.common.schedulers_common import get_scheduler_commands
-from tests.common.utils import _fetch_instance_slots
+from tests.common.utils import fetch_instance_slots
from wrapt_timeout_decorator import timeout


@@ -28,7 +28,7 @@
def test_mpi(scheduler, region, os, instance, pcluster_config_reader, clusters_factory):
    scaledown_idletime = 3
    max_queue_size = 3
-    slots_per_instance = _fetch_instance_slots(region, instance)
+    slots_per_instance = fetch_instance_slots(region, instance)
    cluster_config = pcluster_config_reader(scaledown_idletime=scaledown_idletime, max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)
    remote_command_executor = RemoteCommandExecutor(cluster)
