From 43467475d889005309acb528011fd0b5e98db05c Mon Sep 17 00:00:00 2001 From: Ballantyne Date: Tue, 18 Nov 2014 05:50:13 -0600 Subject: [PATCH 01/13] Moving nodewatcher and sqswatcher into a python package. --- node/nodewatcher/__init__.py | 10 ++++ node/{src => }/nodewatcher/nodewatcher.cfg | 0 node/{src => }/nodewatcher/nodewatcher.py | 7 ++- .../{src => }/nodewatcher/plugins/__init__.py | 0 .../{src => }/nodewatcher/plugins/openlava.py | 0 node/{src => }/nodewatcher/plugins/sge.py | 0 node/{src => }/nodewatcher/plugins/test.py | 0 node/{src => }/nodewatcher/plugins/torque.py | 0 node/setup.py | 56 +++++++++++++++++++ node/sqswatcher/__init__.py | 10 ++++ node/{src => }/sqswatcher/plugins/__init__.py | 0 node/{src => }/sqswatcher/plugins/openlava.py | 0 node/{src => }/sqswatcher/plugins/sge.py | 0 node/{src => }/sqswatcher/plugins/test.py | 0 node/{src => }/sqswatcher/plugins/torque.py | 0 node/{src => }/sqswatcher/sqswatcher.cfg | 0 node/{src => }/sqswatcher/sqswatcher.py | 9 ++- 17 files changed, 87 insertions(+), 5 deletions(-) create mode 100644 node/nodewatcher/__init__.py rename node/{src => }/nodewatcher/nodewatcher.cfg (100%) rename node/{src => }/nodewatcher/nodewatcher.py (98%) rename node/{src => }/nodewatcher/plugins/__init__.py (100%) rename node/{src => }/nodewatcher/plugins/openlava.py (100%) rename node/{src => }/nodewatcher/plugins/sge.py (100%) rename node/{src => }/nodewatcher/plugins/test.py (100%) rename node/{src => }/nodewatcher/plugins/torque.py (100%) create mode 100644 node/setup.py create mode 100644 node/sqswatcher/__init__.py rename node/{src => }/sqswatcher/plugins/__init__.py (100%) rename node/{src => }/sqswatcher/plugins/openlava.py (100%) rename node/{src => }/sqswatcher/plugins/sge.py (100%) rename node/{src => }/sqswatcher/plugins/test.py (100%) rename node/{src => }/sqswatcher/plugins/torque.py (100%) rename node/{src => }/sqswatcher/sqswatcher.cfg (100%) rename node/{src => }/sqswatcher/sqswatcher.py (98%) diff --git 
a/node/nodewatcher/__init__.py b/node/nodewatcher/__init__.py new file mode 100644 index 0000000000..717a83af1e --- /dev/null +++ b/node/nodewatcher/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2013-2014 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Amazon Software License (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/asl/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. diff --git a/node/src/nodewatcher/nodewatcher.cfg b/node/nodewatcher/nodewatcher.cfg similarity index 100% rename from node/src/nodewatcher/nodewatcher.cfg rename to node/nodewatcher/nodewatcher.cfg diff --git a/node/src/nodewatcher/nodewatcher.py b/node/nodewatcher/nodewatcher.py similarity index 98% rename from node/src/nodewatcher/nodewatcher.py rename to node/nodewatcher/nodewatcher.py index 848066f586..36c3abc144 100755 --- a/node/src/nodewatcher/nodewatcher.py +++ b/node/nodewatcher/nodewatcher.py @@ -28,7 +28,7 @@ def getConfig(instance_id): print('running getConfig') config = ConfigParser.RawConfigParser() - config.read('nodewatcher.cfg') + config.read('/etc/nodewatcher.cfg') _region = config.get('nodewatcher', 'region') _scheduler = config.get('nodewatcher', 'scheduler') try: @@ -103,7 +103,7 @@ def selfTerminate(asg): if _capacity > 0: _as_conn.terminate_instance(instance_id, decrement_capacity=True) -if __name__ == "__main__": +def main(): print('Running __main__') instance_id = getInstanceId() hostname = getHostname() @@ -125,3 +125,6 @@ def selfTerminate(asg): if hour_percentile > 95: selfTerminate(asg) + +if __name__ == "__main__": + main() diff --git a/node/src/nodewatcher/plugins/__init__.py 
b/node/nodewatcher/plugins/__init__.py similarity index 100% rename from node/src/nodewatcher/plugins/__init__.py rename to node/nodewatcher/plugins/__init__.py diff --git a/node/src/nodewatcher/plugins/openlava.py b/node/nodewatcher/plugins/openlava.py similarity index 100% rename from node/src/nodewatcher/plugins/openlava.py rename to node/nodewatcher/plugins/openlava.py diff --git a/node/src/nodewatcher/plugins/sge.py b/node/nodewatcher/plugins/sge.py similarity index 100% rename from node/src/nodewatcher/plugins/sge.py rename to node/nodewatcher/plugins/sge.py diff --git a/node/src/nodewatcher/plugins/test.py b/node/nodewatcher/plugins/test.py similarity index 100% rename from node/src/nodewatcher/plugins/test.py rename to node/nodewatcher/plugins/test.py diff --git a/node/src/nodewatcher/plugins/torque.py b/node/nodewatcher/plugins/torque.py similarity index 100% rename from node/src/nodewatcher/plugins/torque.py rename to node/nodewatcher/plugins/torque.py diff --git a/node/setup.py b/node/setup.py new file mode 100644 index 0000000000..2f6be2a138 --- /dev/null +++ b/node/setup.py @@ -0,0 +1,56 @@ +# Copyright 2013-2014 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Amazon Software License (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/asl/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +import os, sys +from setuptools import setup, find_packages + +# Utility function to read the README file. +# Used for the long_description. It's nice, because now 1) we have a top level +# README file and 2) it's easier to type in the README file than to put a raw +# string in below ... 
+def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + +console_scripts = ['sqswatcher = sqswatcher.sqswatcher:main', + 'nodewatcher = nodewatcher.nodewatcher:main'] +version = "0.0.1" +requires = ['boto>=2.34', 'paramiko', 'python-dateutil'] + +if sys.version_info[:2] == (2, 6): + # For python2.6 we have to require argparse since it + # was not in stdlib until 2.7. + requires.append('argparse>=1.1') + +setup( + name = "cfncluster-node", + version = version, + author = "Dougal Ballantyne", + author_email = "dougalb@amazon.com", + description = ("cfncluster-node provides the scripts for a cfncluster node."), + url = ("https://github.com/awslabs/cfncluster"), + license = "Amazon Software License", + packages = find_packages(), + install_requires = requires, + entry_points=dict(console_scripts=console_scripts), + include_package_data = True, + zip_safe = False, + package_data = { + '' : ['examples/config'], + }, + long_description=read('README'), + classifiers=[ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Programming Language :: Python", + "Topic :: Scientific/Engineering", + "License :: Other/Proprietary License", + ], +) diff --git a/node/sqswatcher/__init__.py b/node/sqswatcher/__init__.py new file mode 100644 index 0000000000..717a83af1e --- /dev/null +++ b/node/sqswatcher/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2013-2014 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Amazon Software License (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/asl/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/node/src/sqswatcher/plugins/__init__.py b/node/sqswatcher/plugins/__init__.py similarity index 100% rename from node/src/sqswatcher/plugins/__init__.py rename to node/sqswatcher/plugins/__init__.py diff --git a/node/src/sqswatcher/plugins/openlava.py b/node/sqswatcher/plugins/openlava.py similarity index 100% rename from node/src/sqswatcher/plugins/openlava.py rename to node/sqswatcher/plugins/openlava.py diff --git a/node/src/sqswatcher/plugins/sge.py b/node/sqswatcher/plugins/sge.py similarity index 100% rename from node/src/sqswatcher/plugins/sge.py rename to node/sqswatcher/plugins/sge.py diff --git a/node/src/sqswatcher/plugins/test.py b/node/sqswatcher/plugins/test.py similarity index 100% rename from node/src/sqswatcher/plugins/test.py rename to node/sqswatcher/plugins/test.py diff --git a/node/src/sqswatcher/plugins/torque.py b/node/sqswatcher/plugins/torque.py similarity index 100% rename from node/src/sqswatcher/plugins/torque.py rename to node/sqswatcher/plugins/torque.py diff --git a/node/src/sqswatcher/sqswatcher.cfg b/node/sqswatcher/sqswatcher.cfg similarity index 100% rename from node/src/sqswatcher/sqswatcher.cfg rename to node/sqswatcher/sqswatcher.cfg diff --git a/node/src/sqswatcher/sqswatcher.py b/node/sqswatcher/sqswatcher.py similarity index 98% rename from node/src/sqswatcher/sqswatcher.py rename to node/sqswatcher/sqswatcher.py index c4c0815ea5..e3e0ce6b79 100755 --- a/node/src/sqswatcher/sqswatcher.py +++ b/node/sqswatcher/sqswatcher.py @@ -32,7 +32,7 @@ def getConfig(): print('running getConfig') config = ConfigParser.RawConfigParser() - config.read('sqswatcher.cfg') + config.read('/etc/sqswatcher.cfg') _region = config.get('sqswatcher', 'region') _sqsqueue = config.get('sqswatcher', 'sqsqueue') _table_name = config.get('sqswatcher', 'table_name') @@ -151,10 +151,13 @@ def pollQueue(): time.sleep(30) -if __name__ == "__main__": +def main(): print('running __main__') print time.ctime() region, sqsqueue, table_name, scheduler, 
cluster_user = getConfig() q = setupQueue(region, sqsqueue) t = setupDDBTable(region, table_name) - pollQueue() \ No newline at end of file + pollQueue() + +if __name__ == "__main__": + main() From 47f1958737608a167ceaab52c72075372a3e160a Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Tue, 18 Nov 2014 06:08:52 -0600 Subject: [PATCH 02/13] More generic ignore for egg folders --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 7e14c4f044..df31d307d3 100644 --- a/.gitignore +++ b/.gitignore @@ -5,5 +5,5 @@ *# dist/ build/ -cfncluster.egg-info/ -.idea/ \ No newline at end of file +*.egg-info/ +.idea/ From f76cfeeb55d6ae7e05798fcd25e205ed957ae7a0 Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Wed, 3 Dec 2014 14:23:15 -0800 Subject: [PATCH 03/13] Correcting proxy args --- cloudformation/cfncluster.cfn.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cloudformation/cfncluster.cfn.json b/cloudformation/cfncluster.cfn.json index 7bce8bfdee..ee7f488827 100644 --- a/cloudformation/cfncluster.cfn.json +++ b/cloudformation/cfncluster.cfn.json @@ -1121,7 +1121,7 @@ }, "\n", "if [ \"$proxy\" != \"NONE\" ]; then\n", - " proxy_args=\"--http-proxy=${proxy} --https-proxy=$proxy\"\n", + " proxy_args=\"--http-proxy=${proxy}\"\n", "else\n", " proxy_args=\"\"\n", "fi\n", @@ -1568,7 +1568,7 @@ }, "\n", "if [ \"$proxy\" != \"NONE\" ]; then\n", - " proxy_args=\"--http-proxy=${proxy} --https-proxy=$proxy\"\n", + " proxy_args=\"--http-proxy=${proxy}\"\n", "else\n", " proxy_args=\"\"\n", "fi\n", From cf9199500bff303cf782990f1c04b02de640f436 Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Fri, 5 Dec 2014 09:58:23 -0800 Subject: [PATCH 04/13] Adding proxy support using boto.config --- node/nodewatcher/nodewatcher.py | 6 ++++-- node/sqswatcher/sqswatcher.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/node/nodewatcher/nodewatcher.py 
b/node/nodewatcher/nodewatcher.py index 36c3abc144..cde72e8ad2 100755 --- a/node/nodewatcher/nodewatcher.py +++ b/node/nodewatcher/nodewatcher.py @@ -34,7 +34,8 @@ def getConfig(instance_id): try: _asg = config.get('nodewatcher', 'asg') except ConfigParser.NoOptionError: - conn = boto.ec2.connect_to_region(_region) + conn = boto.ec2.connect_to_region(_region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) _asg = conn.get_all_instances(instance_ids=instance_id)[0].instances[0].tags['aws:autoscaling:groupName'] config.set('nodewatcher', 'asg', _asg) @@ -97,7 +98,8 @@ def getJobs(s,hostname): def selfTerminate(asg): - _as_conn = boto.ec2.autoscale.connect_to_region(region) + _as_conn = boto.ec2.autoscale.connect_to_region(region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) _asg = _as_conn.get_all_groups(names=[asg])[0] _capacity = _asg.desired_capacity if _capacity > 0: diff --git a/node/sqswatcher/sqswatcher.py b/node/sqswatcher/sqswatcher.py index e3e0ce6b79..db765e0ec3 100755 --- a/node/sqswatcher/sqswatcher.py +++ b/node/sqswatcher/sqswatcher.py @@ -45,7 +45,8 @@ def getConfig(): def setupQueue(region, sqsqueue): print('running setupQueue') - conn = boto.sqs.connect_to_region(region) + conn = boto.sqs.connect_to_region(region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) _q = conn.get_queue(sqsqueue) if _q != None: @@ -56,10 +57,12 @@ def setupQueue(region, sqsqueue): def setupDDBTable(region, table_name): print('running setupDDBTable') - conn = boto.dynamodb.connect_to_region(region) + conn = boto.dynamodb.connect_to_region(region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) tables = conn.list_tables() check = [t for t in tables if t == table_name] - conn = boto.dynamodb2.connect_to_region(region) + conn = boto.dynamodb2.connect_to_region(region,proxy=boto.config.get('Boto', 
'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) if check: _table = Table(table_name,connection=conn) else: @@ -105,7 +108,8 @@ def pollQueue(): print eventType, instanceId ec2 = boto.connect_ec2() - ec2 = boto.ec2.connect_to_region(region) + ec2 = boto.ec2.connect_to_region(region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) retry = 0 wait = 15 From ff25fdb3fb6eb4dbee5c8dc973fbfac2268a28ea Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Mon, 15 Dec 2014 11:36:02 -0800 Subject: [PATCH 05/13] Adding notes on autoscaling within cfncluster --- docs/staging/autoscaling.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 docs/staging/autoscaling.txt diff --git a/docs/staging/autoscaling.txt b/docs/staging/autoscaling.txt new file mode 100644 index 0000000000..9d00951b7e --- /dev/null +++ b/docs/staging/autoscaling.txt @@ -0,0 +1,10 @@ +Clusters deployed with cfncluster are elastic in several ways. The first is by simply setting the initial_queue_size and max_queue_size parameters in a cluster's settings. The initial_queue_size sets the minimum size of the ComputeFleet Auto Scaling Group (ASG) and also its desired capacity. The max_queue_size sets the maximum size of the ComputeFleet ASG. As part of cfncluster, two Amazon CloudWatch alarms are created. These alarms monitor a custom Amazon CloudWatch metric[1] that is published by the MasterServer of each cluster; this is the second elastic aspect of cfncluster. This metric is called pending; it is created per Stack and is unique to each cluster. These Amazon CloudWatch alarms call ScaleUp policies associated with the ComputeFleet ASG. This is what handles the automatic addition of compute nodes when there are pending tasks in the cluster. It is capable of scaling the cluster up from zero compute nodes until the alarms no longer trigger or the max_queue_size is reached. 
+ +Within AutoScaling, there is typically an Amazon CloudWatch alarm to remove instances when they are no longer needed. This alarm would operate on an aggregate metric such as CPU or network. When the aggregate metric fell below a certain level, it would make a call to a ScaleDown policy. The decision about which instance to remove is complex[2] and is not aware of individual instance utilization. For that reason, each of the instances in the ComputeFleet ASG runs a process called nodewatcher[3]. The purpose of this process is to monitor the instance and, if it is idle AND close to the end of the current hour, remove it from the ComputeFleet ASG. It specifically calls the TerminateInstanceInAutoScalingGroup[4] API, which will remove an instance as long as the size of the ASG is larger than its minimum size. That is what handles the scale down of the cluster without affecting any running jobs, and it also enables an elastic cluster with a fixed base number of instances. + +The value of auto scaling is the same for HPC as for any other workload; the only difference here is that cfncluster has code to specifically make it interact in a more intelligent manner. If a static cluster is required, this can be achieved by setting the initial_queue_size and max_queue_size parameters to the size of cluster required and also setting the maintain_initial_size parameter to true. This will cause the ComputeFleet ASG to have the same value for minimum, maximum and desired capacity. 
+ +[1] http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/publishingMetrics.html +[2] http://docs.aws.amazon.com/AutoScaling/latest/DeveloperGuide/AutoScalingBehavior.InstanceTermination.html +[3] https://github.com/awslabs/cfncluster/tree/master/node/src/nodewatcher +[4] http://docs.aws.amazon.com/AutoScaling/latest/APIReference/API_TerminateInstanceInAutoScalingGroup.html \ No newline at end of file From af81609f47fd3520d713f92f71e2b583a25cc4e5 Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Tue, 16 Dec 2014 13:53:30 -0800 Subject: [PATCH 06/13] Set dev build number; support DELETE_FAILED for status --- cli/cfncluster/cfncluster.py | 9 +++++---- cli/setup.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cli/cfncluster/cfncluster.py b/cli/cfncluster/cfncluster.py index ae10d6527a..dfe2f08158 100644 --- a/cli/cfncluster/cfncluster.py +++ b/cli/cfncluster/cfncluster.py @@ -261,7 +261,7 @@ def status(args): sys.stdout.flush() if not args.nowait: while ((status != 'CREATE_COMPLETE') and (status != 'UPDATE_COMPLETE') - and (status != 'ROLLBACK_COMPLETE') and (status != 'CREATE_FAILED')): + and (status != 'ROLLBACK_COMPLETE') and (status != 'CREATE_FAILED') and (status != 'DELETE_FAILED')): time.sleep(5) status = cfnconn.describe_stacks(stack)[0].stack_status events = cfnconn.describe_stack_events(stack)[0] @@ -274,11 +274,12 @@ def status(args): outputs = cfnconn.describe_stacks(stack)[0].outputs for output in outputs: print output - elif ((status == 'ROLLBACK_COMPLETE') or (status == 'CREATE_FAILED')): + elif ((status == 'ROLLBACK_COMPLETE') or (status == 'CREATE_FAILED') or (status == 'DELETE_FAILED')): events = cfnconn.describe_stack_events(stack) for event in events: - if event.resource_status == 'CREATE_FAILED': - print event.timestamp, event.resource_status, event.resource_type, event.logical_resource_id, event.resource_status_reason + if ((event.resource_status == 'CREATE_FAILED') or (event.resource_status == 
'DELETE_FAILED')): + print event.timestamp, event.resource_status, event.resource_type, event.logical_resource_id, \ + event.resource_status_reason else: sys.stdout.write('\n') sys.stdout.flush() diff --git a/cli/setup.py b/cli/setup.py index 5d5e9f1cf9..f481403ec3 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -20,7 +20,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() console_scripts = ['cfncluster = cfncluster.cli:main'] -version = "0.0.16" +version = "0.0.99" requires = ['boto>=2.34'] if sys.version_info[:2] == (2, 6): From f5a31e9aee6ee47e3a68d87a878477de502b34d5 Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Sun, 21 Dec 2014 20:17:53 +0000 Subject: [PATCH 07/13] Changes to DNS checks to allow preinstall actions that modify DNS. --- bootstrap/src/scripts/boot_as_compute | 4 ++-- bootstrap/src/scripts/boot_as_master | 4 ++-- bootstrap/src/scripts/functions.shlib | 19 ++++++++++++++++++- bootstrap/src/scripts/sge/boot_as_compute | 7 +++++++ bootstrap/src/scripts/sge/boot_as_master | 6 ++++++ 5 files changed, 35 insertions(+), 5 deletions(-) diff --git a/bootstrap/src/scripts/boot_as_compute b/bootstrap/src/scripts/boot_as_compute index 9532053f16..56164382d4 100755 --- a/bootstrap/src/scripts/boot_as_compute +++ b/bootstrap/src/scripts/boot_as_compute @@ -19,8 +19,8 @@ set -x # Source functions . /opt/cfncluster/scripts/functions.shlib -# Check DNS -check_dns || error_exit "DNS checks failed." +# Check basic DNS +check_dns_ns || error_exit "Basic DNS checks failed." # Run preinstall script if defined run_preinstall diff --git a/bootstrap/src/scripts/boot_as_master b/bootstrap/src/scripts/boot_as_master index 9b6d281a16..60081b2485 100755 --- a/bootstrap/src/scripts/boot_as_master +++ b/bootstrap/src/scripts/boot_as_master @@ -19,8 +19,8 @@ set -x # Source functions . /opt/cfncluster/scripts/functions.shlib -# Check DNS -check_dns || error_exit "DNS checks failed." 
+# Check basic DNS +check_dns_ns || error_exit "Basic DNS checks failed." # Run preinstall script if defined run_preinstall diff --git a/bootstrap/src/scripts/functions.shlib b/bootstrap/src/scripts/functions.shlib index f3c6bb095c..9913feeb6c 100755 --- a/bootstrap/src/scripts/functions.shlib +++ b/bootstrap/src/scripts/functions.shlib @@ -15,8 +15,25 @@ function error_exit () { exit 1 } +# Basic DNS check, using dnsdomain and checking for NS record +function check_dns_ns () { + TRY=0 + domain=$(dnsdomainname) + while [ $TRY -lt 3 ]; do + host -t ns $domain >/dev/null + check_ns=$? + if [ $check_ns -eq 0 ]; then + break + else + sleep 10 + TRY=$(( $TRY + 1 )) + fi + done + return $check_ns +} + # Check DNS is working, as it is required for correct operation -function check_dns () { +function check_dns_fqdn () { TRY=0 while [ $TRY -lt 3 ]; do hostname -f >/dev/null diff --git a/bootstrap/src/scripts/sge/boot_as_compute b/bootstrap/src/scripts/sge/boot_as_compute index f1a0666a08..ea6eaf8b47 100755 --- a/bootstrap/src/scripts/sge/boot_as_compute +++ b/bootstrap/src/scripts/sge/boot_as_compute @@ -15,6 +15,9 @@ set -x . /opt/cfncluster/cfnconfig +# Source functions +. ../functions.shlib + function error_exit () { script=`basename $0` echo "cfncluster: $script - $1" @@ -22,6 +25,10 @@ function error_exit () { exit 1 } +# Check FQDN dns +check_dns_fqdn || error_exit "FQDN DNS checks failed." + + if [ "x$cfn_master" == "x" ]; then error_exit 'No master server specified.' usage diff --git a/bootstrap/src/scripts/sge/boot_as_master b/bootstrap/src/scripts/sge/boot_as_master index 694844dda3..584c7be475 100755 --- a/bootstrap/src/scripts/sge/boot_as_master +++ b/bootstrap/src/scripts/sge/boot_as_master @@ -16,6 +16,9 @@ set -x # Source cfncluster config . /opt/cfncluster/cfnconfig +# Source functions +. 
../functions.shlib + function error_exit () { script=`basename $0` echo "cfncluster: $script - $1" @@ -23,6 +26,9 @@ function error_exit () { exit 1 } +# Check FQDN dns +check_dns_fqdn || error_exit "FQDN DNS checks failed." + myhostname=$(hostname -s) if [ $? != 0 ]; then error_exit 'Failed to determine local hostname' From 5ecef4fc0beb2550386fd23dc11907bed5559249 Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Sun, 21 Dec 2014 15:26:21 -0800 Subject: [PATCH 08/13] Fixing module imports and global variables --- node/nodewatcher/nodewatcher.py | 2 +- node/sqswatcher/sqswatcher.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/node/nodewatcher/nodewatcher.py b/node/nodewatcher/nodewatcher.py index cde72e8ad2..ea5b584a24 100755 --- a/node/nodewatcher/nodewatcher.py +++ b/node/nodewatcher/nodewatcher.py @@ -84,7 +84,7 @@ def getHostname(): def loadSchedulerModule(scheduler): print 'running loadSchedulerModule' - scheduler = 'plugins.' + scheduler + scheduler = 'nodewatcher.plugins.' + scheduler _scheduler = __import__(scheduler) _scheduler = sys.modules[scheduler] diff --git a/node/sqswatcher/sqswatcher.py b/node/sqswatcher/sqswatcher.py index db765e0ec3..eff4b13522 100755 --- a/node/sqswatcher/sqswatcher.py +++ b/node/sqswatcher/sqswatcher.py @@ -76,14 +76,14 @@ def setupDDBTable(region, table_name): def loadSchedulerModule(scheduler): print 'running loadSchedulerModule' - scheduler = 'plugins.' + scheduler + scheduler = 'sqswatcher.plugins.' 
+ scheduler _scheduler = __import__(scheduler) _scheduler = sys.modules[scheduler] return _scheduler -def pollQueue(): +def pollQueue(scheduler, q, t): print 'running pollQueue' s = loadSchedulerModule(scheduler) @@ -158,10 +158,11 @@ def pollQueue(): def main(): print('running __main__') print time.ctime() + global region, cluster_user region, sqsqueue, table_name, scheduler, cluster_user = getConfig() q = setupQueue(region, sqsqueue) t = setupDDBTable(region, table_name) - pollQueue() + pollQueue(scheduler, q, t) if __name__ == "__main__": main() From 26009832b0998840dfa3b22c7b7eb3192581be24 Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Sun, 21 Dec 2014 15:27:24 -0800 Subject: [PATCH 09/13] New version --- node/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node/setup.py b/node/setup.py index 2f6be2a138..568dd750f9 100644 --- a/node/setup.py +++ b/node/setup.py @@ -21,7 +21,7 @@ def read(fname): console_scripts = ['sqswatcher = sqswatcher.sqswatcher:main', 'nodewatcher = nodewatcher.nodewatcher:main'] -version = "0.0.1" +version = "0.0.2" requires = ['boto>=2.34', 'paramiko', 'python-dateutil'] if sys.version_info[:2] == (2, 6): From 436236ff626cc907de4ecefbbc459172556ace93 Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Mon, 22 Dec 2014 08:41:59 -0800 Subject: [PATCH 10/13] Fixing proxy_args to use both http and https --- cloudformation/cfncluster.cfn.json | 145 +++++++++++++++++------------ 1 file changed, 84 insertions(+), 61 deletions(-) diff --git a/cloudformation/cfncluster.cfn.json b/cloudformation/cfncluster.cfn.json index ee7f488827..c0df2e934c 100644 --- a/cloudformation/cfncluster.cfn.json +++ b/cloudformation/cfncluster.cfn.json @@ -4,11 +4,7 @@ "Parameters" : { "KeyName" : { "Description" : "Name of an existing EC2 KeyPair to enable SSH access to the instances", - "Type" : "String", - "MinLength" : "1", - "MaxLength" : "64", - "AllowedPattern" : "[-+_.@ a-zA-Z0-9]+", - "ConstraintDescription" : "can contain 
only alphanumeric characters, spaces, dashes, plusses, underscores, dots, and at signs." + "Type" : "AWS::EC2::KeyPair::KeyName" }, "MasterInstanceType" : { "Description" : "Master Server EC2 instance type", @@ -164,7 +160,7 @@ }, "VPCId" : { "Description" : "ID of the VPC you want to provision cluster into. Only used with UseVPCBase=false", - "Type" : "String" + "Type" : "AWS::EC2::VPC::Id" }, "SSHFrom" : { "Description" : "Lockdown SSH access (default can be accessed from anywhere)", @@ -321,6 +317,14 @@ "torque", "test" ] + }, + "SharedDir" : { + "Description" : "The path/mountpoint for the shared drive", + "Type" : "String", + "Default" : "/shared" + }, + "CLITemplate" : { + "Type" : "String" } }, "Conditions" : { @@ -1051,13 +1055,10 @@ }, "Condition" : "UseS3ReadWritePolicy" }, - "MasterIPAddress" : { + "MasterEIP" : { "Type" : "AWS::EC2::EIP", "Properties" : { - "Domain" : "vpc", - "InstanceId" : { - "Ref" : "MasterServer" - } + "Domain" : "vpc" }, "Condition" : "MasterPublicIp" }, @@ -1108,11 +1109,15 @@ "#!/bin/bash\n\n", "function error_exit\n", "{\n", - " cfn-signal ${proxy_args} -e 1 -r \"$1\" '", + " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", { - "Ref" : "MasterServerWaitHandle" + "Ref" : "AWS::StackName" + }, + " --resource=MasterServer --region=", + { + "Ref" : "AWS::Region" }, - "'\n", + "\n", " exit 1\n", "}\n", "proxy=", @@ -1121,7 +1126,10 @@ }, "\n", "if [ \"$proxy\" != \"NONE\" ]; then\n", - " proxy_args=\"--http-proxy=${proxy}\"\n", + " proxy_args=\"--http-proxy=${proxy} --https-proxy=${proxy}\"\n", + " proxy_host=$(echo \"$proxy\" | awk -F/ '{print $3}' | cut -d: -f1)\n", + " proxy_port=$(echo \"$proxy\" | awk -F/ '{print $3}' | cut -d: -f2)\n", + " echo -e \"[Boto]\nproxy = ${proxy_host}\nproxy_port = ${proxy_port}\n\" >/etc/boto.cfg\n", "else\n", " proxy_args=\"\"\n", "fi\n", @@ -1153,11 +1161,15 @@ }, " -c /tmp/cwlogs/cfn-logs.conf || error_exit 'Failed to run CloudWatch Logs agent setup'\n", "fi\n", - "cfn-signal 
${proxy_args} -e 0 -r \"MasterServer setup complete\" '", + "cfn-signal ${proxy_args} --exit-code=0 --reason=\"MasterServer setup complete\" --stack=", + { + "Ref" : "AWS::StackName" + }, + " --resource=MasterServer --region=", { - "Ref" : "MasterServerWaitHandle" + "Ref" : "AWS::Region" }, - "'\n", + "\n", "# End of file\n" ] ] @@ -1346,7 +1358,7 @@ "owner" : "root", "group" : "root" }, - "/opt/cfncluster/sqswatcher/sqswatcher.cfg" : { + "/etc/sqswatcher.cfg" : { "content" : { "Fn::Join" : [ "", @@ -1393,14 +1405,13 @@ "owner" : "root", "group" : "root" }, - "/opt/cfncluster/cfncluster_supervisord.conf" : { + "/etc/cfncluster/cfncluster_supervisord.conf" : { "content" : { "Fn::Join" : [ "", [ "[program:sqswatcher]\n", - "directory = /opt/cfncluster/sqswatcher\n", - "command = python ./sqswatcher.py\n" + "command = /usr/bin/sqswatcher\n" ] ] }, @@ -1418,20 +1429,14 @@ } } } + }, + "CreationPolicy" : { + "ResourceSignal" : { + "Count" : "1", + "Timeout" : "PT30M" + } } }, - "MasterServerWaitCondition" : { - "Type" : "AWS::CloudFormation::WaitCondition", - "Properties" : { - "Handle" : { - "Ref" : "MasterServerWaitHandle" - }, - "Timeout" : "1800" - } - }, - "MasterServerWaitHandle" : { - "Type" : "AWS::CloudFormation::WaitConditionHandle" - }, "ComputeFleet" : { "Type" : "AWS::AutoScaling::AutoScalingGroup", "Properties" : { @@ -1506,7 +1511,15 @@ ] } }, - "DependsOn" : "MasterServerWaitCondition" + "DependsOn" : "MasterServer", + "CreationPolicy" : { + "ResourceSignal" : { + "Timeout" : "PT30M", + "Count" : { + "Ref" : "ComputeWaitConditionCount" + } + } + } }, "ComputeServerLaunchConfig" : { "Type" : "AWS::AutoScaling::LaunchConfiguration", @@ -1555,11 +1568,15 @@ "#!/bin/bash\n", "function error_exit\n", "{\n", - " cfn-signal ${proxy_args} -e 1 -r \"$1\" '", + " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", { - "Ref" : "ComputeWaitHandle" + "Ref" : "AWS::StackName" }, - "'\n", + " --resource=ComputeFleet --region=", + { + "Ref" : "AWS::Region" 
+ }, + "\n", " exit 1\n", "}\n", "proxy=", @@ -1568,7 +1585,10 @@ }, "\n", "if [ \"$proxy\" != \"NONE\" ]; then\n", - " proxy_args=\"--http-proxy=${proxy}\"\n", + " proxy_args=\"--http-proxy=${proxy} --https-proxy=${proxy}\"\n", + " proxy_host=$(echo \"$proxy\" | awk -F/ '{print $3}' | cut -d: -f1)\n", + " proxy_port=$(echo \"$proxy\" | awk -F/ '{print $3}' | cut -d: -f2)\n", + " echo -e \"[Boto]\nproxy = ${proxy_host}\nproxy_port = ${proxy_port}\n\" >/etc/boto.cfg\n", "else\n", " proxy_args=\"\"\n", "fi\n", @@ -1601,11 +1621,15 @@ }, " -c /tmp/cwlogs/cfn-logs.conf || error_exit 'Failed to run CloudWatch Logs agent setup'\n", "fi\n", - "cfn-signal ${proxy_args} -e 0 -r \"Compute setup complete\" '", + "cfn-signal ${proxy_args} --exit-code=0 --reason=\"Compute setup complete\" --stack=", { - "Ref" : "ComputeWaitHandle" + "Ref" : "AWS::StackName" }, - "'\n" + " --resource=ComputeFleet --region=", + { + "Ref" : "AWS::Region" + }, + "\n" ] ] } @@ -1733,7 +1757,7 @@ "owner" : "root", "group" : "root" }, - "/opt/cfncluster/nodewatcher/nodewatcher.cfg" : { + "/etc/nodewatcher.cfg" : { "content" : { "Fn::Join" : [ "", @@ -1756,14 +1780,13 @@ "owner" : "root", "group" : "root" }, - "/opt/cfncluster/cfncluster_supervisord.conf" : { + "/etc/cfncluster/cfncluster_supervisord.conf" : { "content" : { "Fn::Join" : [ "", [ "[program:nodewatcher]\n", - "directory = /opt/cfncluster/nodewatcher\n", - "command = python ./nodewatcher.py\n" + "command = /usr/bin/nodewatcher\n" ] ] }, @@ -1830,21 +1853,6 @@ } } }, - "ComputeWaitHandle" : { - "Type" : "AWS::CloudFormation::WaitConditionHandle" - }, - "ComputeWaitCondition" : { - "Type" : "AWS::CloudFormation::WaitCondition", - "Properties" : { - "Handle" : { - "Ref" : "ComputeWaitHandle" - }, - "Timeout" : "900", - "Count" : { - "Ref" : "ComputeWaitConditionCount" - } - } - }, "ScaleUpPolicy2" : { "Type" : "AWS::AutoScaling::ScalingPolicy", "Properties" : { @@ -2152,6 +2160,21 @@ "RetentionInDays" : 7 }, "Condition" : "CloudWatchLogs" + 
}, + "AssosiateEIP" : { + "Type" : "AWS::EC2::EIPAssociation", + "Properties" : { + "AllocationId" : { + "Fn::GetAtt" : [ + "MasterEIP", + "AllocationId" + ] + }, + "NetworkInterfaceId" : { + "Ref" : "MasterENI" + } + }, + "Condition" : "MasterPublicIp" } }, "Outputs" : { From 118f7ed6d589e29298a3d5b6f1c033ff79b965a2 Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Mon, 22 Dec 2014 11:36:00 -0800 Subject: [PATCH 11/13] Adding cfn_proxy variable, CLITemplate parameters and small sleep for deletes. --- cli/cfncluster/cfncluster.py | 1 + cli/cfncluster/cfnconfig.py | 1 + cloudformation/cfncluster.cfn.json | 10 ++++++++++ 3 files changed, 12 insertions(+) diff --git a/cli/cfncluster/cfncluster.py b/cli/cfncluster/cfncluster.py index dfe2f08158..5279105882 100644 --- a/cli/cfncluster/cfncluster.py +++ b/cli/cfncluster/cfncluster.py @@ -305,6 +305,7 @@ def delete(args): aws_secret_access_key=config.aws_secret_access_key) try: cfnconn.delete_stack(stack) + time.sleep(5) status = cfnconn.describe_stacks(stack)[0].stack_status sys.stdout.write('\rStatus: %s' % status) sys.stdout.flush() diff --git a/cli/cfncluster/cfnconfig.py b/cli/cfncluster/cfnconfig.py index 0517779c4d..339eb76a22 100644 --- a/cli/cfncluster/cfnconfig.py +++ b/cli/cfncluster/cfnconfig.py @@ -59,6 +59,7 @@ def __init__(self, args): except AttributeError: self.__cluster_template = __config.get('global', 'cluster_template') self.__cluster_section = ('cluster %s' % self.__cluster_template) + self.parameters.append(('CLITemplate',self.__cluster_template)) # Check if package updates should be checked try: diff --git a/cloudformation/cfncluster.cfn.json b/cloudformation/cfncluster.cfn.json index c0df2e934c..84633a8232 100644 --- a/cloudformation/cfncluster.cfn.json +++ b/cloudformation/cfncluster.cfn.json @@ -1339,6 +1339,11 @@ "Ref" : "EphemeralDir" }, "\n", + "cfn_proxy=", + { + "Ref" : "ProxyServer" + }, + "\n", "cfn_node_type=MasterServer\n", "cfn_cluster_user=", { @@ -1720,6 +1725,11 @@ "Ref" : 
"SQS" }, "\n", + "cfn_proxy=", + { + "Ref" : "ProxyServer" + }, + "\n", "cfn_master=", { "Fn::GetAtt" : [ From 58b5647c80f6a22c36bb3a602c6ce55aa19976ed Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Mon, 22 Dec 2014 14:51:21 -0800 Subject: [PATCH 12/13] New AMIs. Addtional CloudWatch logs regions. --- amis.txt | 20 +++++++++--------- cloudformation/cfncluster.cfn.json | 34 +++++++++++++++--------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/amis.txt b/amis.txt index d0e31a6e48..2042e1e37a 100644 --- a/amis.txt +++ b/amis.txt @@ -1,10 +1,10 @@ -us-west-2 ami-ef632ddf -eu-central-1 ami-1c132501 -sa-east-1 ami-d3d364ce -ap-northeast-1 ami-2188bb20 -eu-west-1 ami-3470d943 -us-east-1 ami-f4ea6b9c -us-west-1 ami-c30e1a86 -ap-southeast-2 ami-537e1269 -ap-southeast-1 ami-0c70505e -us-gov-west-1 ami-8f1573ac +us-west-2 ami-2b095a1b +eu-central-1 ami-b03606ad +sa-east-1 ami-f18635ec +ap-northeast-1 ami-f2171ff3 +eu-west-1 ami-c215adb5 +us-east-1 ami-0e274866 +us-west-1 ami-abddc0ee +ap-southeast-2 ami-a5701b9f +ap-southeast-1 ami-839bb4d1 +us-gov-west-1 ami-9196f0b2 diff --git a/cloudformation/cfncluster.cfn.json b/cloudformation/cfncluster.cfn.json index 84633a8232..2444b19e4b 100644 --- a/cloudformation/cfncluster.cfn.json +++ b/cloudformation/cfncluster.cfn.json @@ -1,6 +1,6 @@ { "AWSTemplateFormatVersion" : "2010-09-09", - "Description" : "AWS CloudFormation Sample Template cfncluster.cfn.json: Sample template showing an framework for deploying master + compute type clusters on AWS. **WARNING** This template creates AWS resources. You will be billed for the AWS resources used if you create a stack from this template. Version: ami-20141013-1 cfn-20141013-0", + "Description" : "AWS CloudFormation Sample Template cfncluster.cfn.json: Sample template showing an framework for deploying master + compute type clusters on AWS. **WARNING** This template creates AWS resources. 
You will be billed for the AWS resources used if you create a stack from this template. Version: ami-20141222-0 cfncluster-0.0.17", "Parameters" : { "KeyName" : { "Description" : "Name of an existing EC2 KeyPair to enable SSH access to the instances", @@ -650,34 +650,34 @@ }, "AWSRegionOS2AMI" : { "eu-west-1" : { - "centos6" : "ami-3470d943" + "centos6" : "ami-c215adb5" }, "us-east-1" : { - "centos6" : "ami-f4ea6b9c" + "centos6" : "ami-0e274866" }, "ap-northeast-1" : { - "centos6" : "ami-2188bb20" + "centos6" : "ami-f2171ff3" }, "us-west-2" : { - "centos6" : "ami-ef632ddf" + "centos6" : "ami-2b095a1b" }, "sa-east-1" : { - "centos6" : "ami-d3d364ce" + "centos6" : "ami-f18635ec" }, "us-west-1" : { - "centos6" : "ami-c30e1a86" + "centos6" : "ami-abddc0ee" }, "ap-southeast-1" : { - "centos6" : "ami-0c70505e" + "centos6" : "ami-839bb4d1" }, "ap-southeast-2" : { - "centos6" : "ami-537e1269" + "centos6" : "ami-a5701b9f" }, "eu-central-1" : { - "centos6" : "ami-1c132501" + "centos6" : "ami-b03606ad" }, "us-gov-west-1" : { - "centos6" : "ami-8f1573ac" + "centos6" : "ami-9196f0b2" } }, "ClusterUser" : { @@ -690,7 +690,7 @@ }, "AWSRegion2Capabilites" : { "eu-west-1" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "us-east-1" : { @@ -698,11 +698,11 @@ "arn" : "aws" }, "ap-northeast-1" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "us-west-2" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "sa-east-1" : { @@ -714,15 +714,15 @@ "arn" : "aws" }, "ap-southeast-1" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "ap-southeast-2" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "eu-central-1" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "us-gov-west-1" : { From 9975aa4446db4da5531c64106f43c7c9bd601a21 Mon Sep 17 00:00:00 2001 From: Dougal Ballantyne Date: Mon, 22 Dec 2014 14:55:36 -0800 Subject: [PATCH 13/13] Staging 0.0.17 release to develop --- CHANGELOG.rst | 8 ++++++++ README.md | 2 +- cli/setup.py | 2 +- 3 files 
changed, 10 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cfa5005d86..08455de135 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,14 @@ CHANGELOG ========= +0.0.17 +====== +* updates:``ami``: Pulled latest CentOS errata. Now CentOS 6.6. +* updates:``ami``: Updated SGE to 8.1.6 +* updates:``ami``: Updated openlava to latest pull from GitHub +* bugfix:``ami``: Fixed handling of HTTP(S) proxies +* feature:``ami``: Moved sqswatcher and nodewatcher into Python package cfncluster-node + 0.0.16 ====== * feature:``cfncluster``: Support for GovCloud region diff --git a/README.md b/README.md index 993ecc2af3..4b32cb4e54 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ everything is done using CloudFormation or resources within AWS. ### Installation -The current working version is cfncluster-0.0.16. The CLI is written in python and uses BOTO for AWS actions. You can install the CLI with the following command: +The current working version is cfncluster-0.0.17. The CLI is written in python and uses BOTO for AWS actions. You can install the CLI with the following command: #### Linux/OSX diff --git a/cli/setup.py b/cli/setup.py index f481403ec3..e8f4a11dab 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -20,7 +20,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() console_scripts = ['cfncluster = cfncluster.cli:main'] -version = "0.0.99" +version = "0.0.17" requires = ['boto>=2.34'] if sys.version_info[:2] == (2, 6):