diff --git a/.gitignore b/.gitignore index 7e14c4f044..df31d307d3 100644 --- a/.gitignore +++ b/.gitignore @@ -5,5 +5,5 @@ *# dist/ build/ -cfncluster.egg-info/ -.idea/ \ No newline at end of file +*.egg-info/ +.idea/ diff --git a/CHANGELOG.rst b/CHANGELOG.rst index cfa5005d86..08455de135 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,14 @@ CHANGELOG ========= +0.0.17 +====== +* updates:``ami``: Pulled latest CentOS errata. Now CentOS 6.6. +* updates:``ami``: Updated SGE to 8.1.6 +* updates:``ami``: Updates openlava to latest pull from GitHub +* bugfix:``ami``: Fixed handling of HTTP(S) proxies +* feature:``ami``: Moved sqswatcher and nodewatcher into Python package cfncluster-node + 0.0.16 ====== * feature:``cfncluster``: Support for GovCloud region diff --git a/README.md b/README.md index 993ecc2af3..4b32cb4e54 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ everything is done using CloudFormation or resources within AWS. ### Installation -The current working version is cfncluster-0.0.16. The CLI is written in python and uses BOTO for AWS actions. You can install the CLI with the following command: +The current working version is cfncluster-0.0.17. The CLI is written in python and uses BOTO for AWS actions. 
You can install the CLI with the following command: #### Linux/OSX diff --git a/amis.txt b/amis.txt index d0e31a6e48..2042e1e37a 100644 --- a/amis.txt +++ b/amis.txt @@ -1,10 +1,10 @@ -us-west-2 ami-ef632ddf -eu-central-1 ami-1c132501 -sa-east-1 ami-d3d364ce -ap-northeast-1 ami-2188bb20 -eu-west-1 ami-3470d943 -us-east-1 ami-f4ea6b9c -us-west-1 ami-c30e1a86 -ap-southeast-2 ami-537e1269 -ap-southeast-1 ami-0c70505e -us-gov-west-1 ami-8f1573ac +us-west-2 ami-2b095a1b +eu-central-1 ami-b03606ad +sa-east-1 ami-f18635ec +ap-northeast-1 ami-f2171ff3 +eu-west-1 ami-c215adb5 +us-east-1 ami-0e274866 +us-west-1 ami-abddc0ee +ap-southeast-2 ami-a5701b9f +ap-southeast-1 ami-839bb4d1 +us-gov-west-1 ami-9196f0b2 diff --git a/bootstrap/src/scripts/boot_as_compute b/bootstrap/src/scripts/boot_as_compute index 9532053f16..56164382d4 100755 --- a/bootstrap/src/scripts/boot_as_compute +++ b/bootstrap/src/scripts/boot_as_compute @@ -19,8 +19,8 @@ set -x # Source functions . /opt/cfncluster/scripts/functions.shlib -# Check DNS -check_dns || error_exit "DNS checks failed." +# Check basic DNS +check_dns_ns || error_exit "Basic DNS checks failed." # Run preinstall script if defined run_preinstall diff --git a/bootstrap/src/scripts/boot_as_master b/bootstrap/src/scripts/boot_as_master index 9b6d281a16..60081b2485 100755 --- a/bootstrap/src/scripts/boot_as_master +++ b/bootstrap/src/scripts/boot_as_master @@ -19,8 +19,8 @@ set -x # Source functions . /opt/cfncluster/scripts/functions.shlib -# Check DNS -check_dns || error_exit "DNS checks failed." +# Check basic DNS +check_dns_ns || error_exit "Basic DNS checks failed." 
# Run preinstall script if defined run_preinstall diff --git a/bootstrap/src/scripts/functions.shlib b/bootstrap/src/scripts/functions.shlib index f3c6bb095c..9913feeb6c 100755 --- a/bootstrap/src/scripts/functions.shlib +++ b/bootstrap/src/scripts/functions.shlib @@ -15,8 +15,25 @@ function error_exit () { exit 1 } +# Basic DNS check, using dnsdomain and checking for NS record +function check_dns_ns () { + TRY=0 + domain=$(dnsdomainname) + while [ $TRY -lt 3 ]; do + host -t ns $domain >/dev/null + check_ns=$? + if [ $check_ns -eq 0 ]; then + break + else + sleep 10 + TRY=$(( $TRY + 1 )) + fi + done + return $check_ns +} + # Check DNS is working, as it is required for correct operation -function check_dns () { +function check_dns_fqdn () { TRY=0 while [ $TRY -lt 3 ]; do hostname -f >/dev/null diff --git a/bootstrap/src/scripts/sge/boot_as_compute b/bootstrap/src/scripts/sge/boot_as_compute index f1a0666a08..ea6eaf8b47 100755 --- a/bootstrap/src/scripts/sge/boot_as_compute +++ b/bootstrap/src/scripts/sge/boot_as_compute @@ -15,6 +15,9 @@ set -x . /opt/cfncluster/cfnconfig +# Source functions +. ../functions.shlib + function error_exit () { script=`basename $0` echo "cfncluster: $script - $1" @@ -22,6 +25,10 @@ function error_exit () { exit 1 } +# Check FQDN dns +check_dns_fqdn || error_exit "FQDN DNS checks failed." + + if [ "x$cfn_master" == "x" ]; then error_exit 'No master server specified.' usage diff --git a/bootstrap/src/scripts/sge/boot_as_master b/bootstrap/src/scripts/sge/boot_as_master index 694844dda3..584c7be475 100755 --- a/bootstrap/src/scripts/sge/boot_as_master +++ b/bootstrap/src/scripts/sge/boot_as_master @@ -16,6 +16,9 @@ set -x # Source cfncluster config . /opt/cfncluster/cfnconfig +# Source functions +. ../functions.shlib + function error_exit () { script=`basename $0` echo "cfncluster: $script - $1" @@ -23,6 +26,9 @@ function error_exit () { exit 1 } +# Check FQDN dns +check_dns_fqdn || error_exit "FQDN DNS checks failed." 
+ myhostname=$(hostname -s) if [ $? != 0 ]; then error_exit 'Failed to determine local hostname' diff --git a/cli/cfncluster/cfncluster.py b/cli/cfncluster/cfncluster.py index ae10d6527a..5279105882 100644 --- a/cli/cfncluster/cfncluster.py +++ b/cli/cfncluster/cfncluster.py @@ -261,7 +261,7 @@ def status(args): sys.stdout.flush() if not args.nowait: while ((status != 'CREATE_COMPLETE') and (status != 'UPDATE_COMPLETE') - and (status != 'ROLLBACK_COMPLETE') and (status != 'CREATE_FAILED')): + and (status != 'ROLLBACK_COMPLETE') and (status != 'CREATE_FAILED') and (status != 'DELETE_FAILED')): time.sleep(5) status = cfnconn.describe_stacks(stack)[0].stack_status events = cfnconn.describe_stack_events(stack)[0] @@ -274,11 +274,12 @@ def status(args): outputs = cfnconn.describe_stacks(stack)[0].outputs for output in outputs: print output - elif ((status == 'ROLLBACK_COMPLETE') or (status == 'CREATE_FAILED')): + elif ((status == 'ROLLBACK_COMPLETE') or (status == 'CREATE_FAILED') or (status == 'DELETE_FAILED')): events = cfnconn.describe_stack_events(stack) for event in events: - if event.resource_status == 'CREATE_FAILED': - print event.timestamp, event.resource_status, event.resource_type, event.logical_resource_id, event.resource_status_reason + if ((event.resource_status == 'CREATE_FAILED') or (event.resource_status == 'DELETE_FAILED')): + print event.timestamp, event.resource_status, event.resource_type, event.logical_resource_id, \ + event.resource_status_reason else: sys.stdout.write('\n') sys.stdout.flush() @@ -304,6 +305,7 @@ def delete(args): aws_secret_access_key=config.aws_secret_access_key) try: cfnconn.delete_stack(stack) + time.sleep(5) status = cfnconn.describe_stacks(stack)[0].stack_status sys.stdout.write('\rStatus: %s' % status) sys.stdout.flush() diff --git a/cli/cfncluster/cfnconfig.py b/cli/cfncluster/cfnconfig.py index 0517779c4d..339eb76a22 100644 --- a/cli/cfncluster/cfnconfig.py +++ b/cli/cfncluster/cfnconfig.py @@ -59,6 +59,7 @@ def 
__init__(self, args): except AttributeError: self.__cluster_template = __config.get('global', 'cluster_template') self.__cluster_section = ('cluster %s' % self.__cluster_template) + self.parameters.append(('CLITemplate',self.__cluster_template)) # Check if package updates should be checked try: diff --git a/cli/setup.py b/cli/setup.py index 5d5e9f1cf9..e8f4a11dab 100644 --- a/cli/setup.py +++ b/cli/setup.py @@ -20,7 +20,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() console_scripts = ['cfncluster = cfncluster.cli:main'] -version = "0.0.16" +version = "0.0.17" requires = ['boto>=2.34'] if sys.version_info[:2] == (2, 6): diff --git a/cloudformation/cfncluster.cfn.json b/cloudformation/cfncluster.cfn.json index 7bce8bfdee..2444b19e4b 100644 --- a/cloudformation/cfncluster.cfn.json +++ b/cloudformation/cfncluster.cfn.json @@ -1,14 +1,10 @@ { "AWSTemplateFormatVersion" : "2010-09-09", - "Description" : "AWS CloudFormation Sample Template cfncluster.cfn.json: Sample template showing an framework for deploying master + compute type clusters on AWS. **WARNING** This template creates AWS resources. You will be billed for the AWS resources used if you create a stack from this template. Version: ami-20141013-1 cfn-20141013-0", + "Description" : "AWS CloudFormation Sample Template cfncluster.cfn.json: Sample template showing an framework for deploying master + compute type clusters on AWS. **WARNING** This template creates AWS resources. You will be billed for the AWS resources used if you create a stack from this template. Version: ami-20141222-0 cfncluster-0.0.17", "Parameters" : { "KeyName" : { "Description" : "Name of an existing EC2 KeyPair to enable SSH access to the instances", - "Type" : "String", - "MinLength" : "1", - "MaxLength" : "64", - "AllowedPattern" : "[-+_.@ a-zA-Z0-9]+", - "ConstraintDescription" : "can contain only alphanumeric characters, spaces, dashes, plusses, underscores, dots, and at signs." 
+ "Type" : "AWS::EC2::KeyPair::KeyName" }, "MasterInstanceType" : { "Description" : "Master Server EC2 instance type", @@ -164,7 +160,7 @@ }, "VPCId" : { "Description" : "ID of the VPC you want to provision cluster into. Only used with UseVPCBase=false", - "Type" : "String" + "Type" : "AWS::EC2::VPC::Id" }, "SSHFrom" : { "Description" : "Lockdown SSH access (default can be accessed from anywhere)", @@ -321,6 +317,14 @@ "torque", "test" ] + }, + "SharedDir" : { + "Description" : "The path/mountpoint for the shared drive", + "Type" : "String", + "Default" : "/shared" + }, + "CLITemplate" : { + "Type" : "String" } }, "Conditions" : { @@ -646,34 +650,34 @@ }, "AWSRegionOS2AMI" : { "eu-west-1" : { - "centos6" : "ami-3470d943" + "centos6" : "ami-c215adb5" }, "us-east-1" : { - "centos6" : "ami-f4ea6b9c" + "centos6" : "ami-0e274866" }, "ap-northeast-1" : { - "centos6" : "ami-2188bb20" + "centos6" : "ami-f2171ff3" }, "us-west-2" : { - "centos6" : "ami-ef632ddf" + "centos6" : "ami-2b095a1b" }, "sa-east-1" : { - "centos6" : "ami-d3d364ce" + "centos6" : "ami-f18635ec" }, "us-west-1" : { - "centos6" : "ami-c30e1a86" + "centos6" : "ami-abddc0ee" }, "ap-southeast-1" : { - "centos6" : "ami-0c70505e" + "centos6" : "ami-839bb4d1" }, "ap-southeast-2" : { - "centos6" : "ami-537e1269" + "centos6" : "ami-a5701b9f" }, "eu-central-1" : { - "centos6" : "ami-1c132501" + "centos6" : "ami-b03606ad" }, "us-gov-west-1" : { - "centos6" : "ami-8f1573ac" + "centos6" : "ami-9196f0b2" } }, "ClusterUser" : { @@ -686,7 +690,7 @@ }, "AWSRegion2Capabilites" : { "eu-west-1" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "us-east-1" : { @@ -694,11 +698,11 @@ "arn" : "aws" }, "ap-northeast-1" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "us-west-2" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "sa-east-1" : { @@ -710,15 +714,15 @@ "arn" : "aws" }, "ap-southeast-1" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "ap-southeast-2" : { - "cwl" : "false", + "cwl" : 
"true", "arn" : "aws" }, "eu-central-1" : { - "cwl" : "false", + "cwl" : "true", "arn" : "aws" }, "us-gov-west-1" : { @@ -1051,13 +1055,10 @@ }, "Condition" : "UseS3ReadWritePolicy" }, - "MasterIPAddress" : { + "MasterEIP" : { "Type" : "AWS::EC2::EIP", "Properties" : { - "Domain" : "vpc", - "InstanceId" : { - "Ref" : "MasterServer" - } + "Domain" : "vpc" }, "Condition" : "MasterPublicIp" }, @@ -1108,11 +1109,15 @@ "#!/bin/bash\n\n", "function error_exit\n", "{\n", - " cfn-signal ${proxy_args} -e 1 -r \"$1\" '", + " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", { - "Ref" : "MasterServerWaitHandle" + "Ref" : "AWS::StackName" + }, + " --resource=MasterServer --region=", + { + "Ref" : "AWS::Region" }, - "'\n", + "\n", " exit 1\n", "}\n", "proxy=", @@ -1121,7 +1126,10 @@ }, "\n", "if [ \"$proxy\" != \"NONE\" ]; then\n", - " proxy_args=\"--http-proxy=${proxy} --https-proxy=$proxy\"\n", + " proxy_args=\"--http-proxy=${proxy} --https-proxy=${proxy}\"\n", + " proxy_host=$(echo \"$proxy\" | awk -F/ '{print $3}' | cut -d: -f1)\n", + " proxy_port=$(echo \"$proxy\" | awk -F/ '{print $3}' | cut -d: -f2)\n", + " echo -e \"[Boto]\nproxy = ${proxy_host}\nproxy_port = ${proxy_port}\n\" >/etc/boto.cfg\n", "else\n", " proxy_args=\"\"\n", "fi\n", @@ -1153,11 +1161,15 @@ }, " -c /tmp/cwlogs/cfn-logs.conf || error_exit 'Failed to run CloudWatch Logs agent setup'\n", "fi\n", - "cfn-signal ${proxy_args} -e 0 -r \"MasterServer setup complete\" '", + "cfn-signal ${proxy_args} --exit-code=0 --reason=\"MasterServer setup complete\" --stack=", { - "Ref" : "MasterServerWaitHandle" + "Ref" : "AWS::StackName" }, - "'\n", + " --resource=MasterServer --region=", + { + "Ref" : "AWS::Region" + }, + "\n", "# End of file\n" ] ] @@ -1327,6 +1339,11 @@ "Ref" : "EphemeralDir" }, "\n", + "cfn_proxy=", + { + "Ref" : "ProxyServer" + }, + "\n", "cfn_node_type=MasterServer\n", "cfn_cluster_user=", { @@ -1346,7 +1363,7 @@ "owner" : "root", "group" : "root" }, - 
"/opt/cfncluster/sqswatcher/sqswatcher.cfg" : { + "/etc/sqswatcher.cfg" : { "content" : { "Fn::Join" : [ "", @@ -1393,14 +1410,13 @@ "owner" : "root", "group" : "root" }, - "/opt/cfncluster/cfncluster_supervisord.conf" : { + "/etc/cfncluster/cfncluster_supervisord.conf" : { "content" : { "Fn::Join" : [ "", [ "[program:sqswatcher]\n", - "directory = /opt/cfncluster/sqswatcher\n", - "command = python ./sqswatcher.py\n" + "command = /usr/bin/sqswatcher\n" ] ] }, @@ -1418,20 +1434,14 @@ } } } + }, + "CreationPolicy" : { + "ResourceSignal" : { + "Count" : "1", + "Timeout" : "PT30M" + } } }, - "MasterServerWaitCondition" : { - "Type" : "AWS::CloudFormation::WaitCondition", - "Properties" : { - "Handle" : { - "Ref" : "MasterServerWaitHandle" - }, - "Timeout" : "1800" - } - }, - "MasterServerWaitHandle" : { - "Type" : "AWS::CloudFormation::WaitConditionHandle" - }, "ComputeFleet" : { "Type" : "AWS::AutoScaling::AutoScalingGroup", "Properties" : { @@ -1506,7 +1516,15 @@ ] } }, - "DependsOn" : "MasterServerWaitCondition" + "DependsOn" : "MasterServer", + "CreationPolicy" : { + "ResourceSignal" : { + "Timeout" : "PT30M", + "Count" : { + "Ref" : "ComputeWaitConditionCount" + } + } + } }, "ComputeServerLaunchConfig" : { "Type" : "AWS::AutoScaling::LaunchConfiguration", @@ -1555,11 +1573,15 @@ "#!/bin/bash\n", "function error_exit\n", "{\n", - " cfn-signal ${proxy_args} -e 1 -r \"$1\" '", + " cfn-signal ${proxy_args} --exit-code=1 --reason=\"$1\" --stack=", { - "Ref" : "ComputeWaitHandle" + "Ref" : "AWS::StackName" }, - "'\n", + " --resource=ComputeFleet --region=", + { + "Ref" : "AWS::Region" + }, + "\n", " exit 1\n", "}\n", "proxy=", @@ -1568,7 +1590,10 @@ }, "\n", "if [ \"$proxy\" != \"NONE\" ]; then\n", - " proxy_args=\"--http-proxy=${proxy} --https-proxy=$proxy\"\n", + " proxy_args=\"--http-proxy=${proxy} --https-proxy=${proxy}\"\n", + " proxy_host=$(echo \"$proxy\" | awk -F/ '{print $3}' | cut -d: -f1)\n", + " proxy_port=$(echo \"$proxy\" | awk -F/ '{print $3}' | cut -d: 
-f2)\n", + " echo -e \"[Boto]\nproxy = ${proxy_host}\nproxy_port = ${proxy_port}\n\" >/etc/boto.cfg\n", "else\n", " proxy_args=\"\"\n", "fi\n", @@ -1601,11 +1626,15 @@ }, " -c /tmp/cwlogs/cfn-logs.conf || error_exit 'Failed to run CloudWatch Logs agent setup'\n", "fi\n", - "cfn-signal ${proxy_args} -e 0 -r \"Compute setup complete\" '", + "cfn-signal ${proxy_args} --exit-code=0 --reason=\"Compute setup complete\" --stack=", + { + "Ref" : "AWS::StackName" + }, + " --resource=ComputeFleet --region=", { - "Ref" : "ComputeWaitHandle" + "Ref" : "AWS::Region" }, - "'\n" + "\n" ] ] } @@ -1696,6 +1725,11 @@ "Ref" : "SQS" }, "\n", + "cfn_proxy=", + { + "Ref" : "ProxyServer" + }, + "\n", "cfn_master=", { "Fn::GetAtt" : [ @@ -1733,7 +1767,7 @@ "owner" : "root", "group" : "root" }, - "/opt/cfncluster/nodewatcher/nodewatcher.cfg" : { + "/etc/nodewatcher.cfg" : { "content" : { "Fn::Join" : [ "", @@ -1756,14 +1790,13 @@ "owner" : "root", "group" : "root" }, - "/opt/cfncluster/cfncluster_supervisord.conf" : { + "/etc/cfncluster/cfncluster_supervisord.conf" : { "content" : { "Fn::Join" : [ "", [ "[program:nodewatcher]\n", - "directory = /opt/cfncluster/nodewatcher\n", - "command = python ./nodewatcher.py\n" + "command = /usr/bin/nodewatcher\n" ] ] }, @@ -1830,21 +1863,6 @@ } } }, - "ComputeWaitHandle" : { - "Type" : "AWS::CloudFormation::WaitConditionHandle" - }, - "ComputeWaitCondition" : { - "Type" : "AWS::CloudFormation::WaitCondition", - "Properties" : { - "Handle" : { - "Ref" : "ComputeWaitHandle" - }, - "Timeout" : "900", - "Count" : { - "Ref" : "ComputeWaitConditionCount" - } - } - }, "ScaleUpPolicy2" : { "Type" : "AWS::AutoScaling::ScalingPolicy", "Properties" : { @@ -2152,6 +2170,21 @@ "RetentionInDays" : 7 }, "Condition" : "CloudWatchLogs" + }, + "AssosiateEIP" : { + "Type" : "AWS::EC2::EIPAssociation", + "Properties" : { + "AllocationId" : { + "Fn::GetAtt" : [ + "MasterEIP", + "AllocationId" + ] + }, + "NetworkInterfaceId" : { + "Ref" : "MasterENI" + } + }, + "Condition" 
: "MasterPublicIp" } }, "Outputs" : { diff --git a/docs/staging/autoscaling.txt b/docs/staging/autoscaling.txt new file mode 100644 index 0000000000..9d00951b7e --- /dev/null +++ b/docs/staging/autoscaling.txt @@ -0,0 +1,10 @@ +Clusters deployed with cfncluster are elastic in several ways. The first is by simply setting the initial_queue_size and max_queue_size parameters of a cluster's settings. The initial_queue_size sets the minimum size value of the ComputeFleet Auto Scaling Group(ASG) and also the desired capacity value. The max_queue_size sets the maximum size value of the ComputeFleet ASG. As part of the cfncluster, two Amazon CloudWatch alarms are created. These alarms monitor a custom Amazon CloudWatch metric[1] that is published by the MasterServer of each cluster; this is the second elastic nature of cfncluster. This metric is called pending and is created per Stack and unique to each cluster. These Amazon CloudWatch alarms call ScaleUp policies associated with the ComputeFleet ASG. This is what handles the automatic addition of compute nodes when there are pending tasks in the cluster. It is actually capable of scaling the cluster with zero compute nodes until the alarms no longer trigger or the max_queue_size is reached. + +Within AutoScaling, there is typically an Amazon CloudWatch alarm to remove instances when no longer needed. This alarm would operate on an aggregate metric such as CPU or network. When the aggregate metric fell below a certain level, it would make a call to a ScaleDown policy. The decision of which instance to remove is complex[2] and is not aware of individual instance utilization. For that reason, each one of the instances in the ComputeFleet ASG runs a process called nodewatcher[3]. The purpose of this process is to monitor the instance and if idle AND close to the end of the current hour, remove it from the ComputeFleet ASG. 
It specifically calls the TerminateInstanceInAutoScalingGroup[4] API call, which will remove an instance as long as the size of the ASG is larger than the desired capacity. That is what handles the scale down of the cluster, without affecting any running jobs and also enables an elastic cluster with a fixed base number of instances. + +The value of the auto scaling is the same for HPC as with any other workloads; the only difference here is that cfncluster has code to specifically make it interact in a more intelligent manner. If a static cluster is required, this can be achieved by setting the initial_queue_size and max_queue_size parameters to the size of the cluster required and also setting the maintain_initial_size parameter to true. This will cause the ComputeFleet ASG to have the same value for minimum, maximum and desired capacity. + +[1] http://docs.aws.amazon.com/AmazonCloudWatch/latest/DeveloperGuide/publishingMetrics.html +[2] http://docs.aws.amazon.com/AutoScaling/latest/DeveloperGuide/AutoScalingBehavior.InstanceTermination.html +[3] https://github.com/awslabs/cfncluster/tree/master/node/src/nodewatcher +[4] http://docs.aws.amazon.com/AutoScaling/latest/APIReference/API_TerminateInstanceInAutoScalingGroup.html \ No newline at end of file diff --git a/node/nodewatcher/__init__.py b/node/nodewatcher/__init__.py new file mode 100644 index 0000000000..717a83af1e --- /dev/null +++ b/node/nodewatcher/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2013-2014 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Amazon Software License (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/asl/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. 
See the License for the specific language governing permissions and +# limitations under the License. diff --git a/node/src/nodewatcher/nodewatcher.cfg b/node/nodewatcher/nodewatcher.cfg similarity index 100% rename from node/src/nodewatcher/nodewatcher.cfg rename to node/nodewatcher/nodewatcher.cfg diff --git a/node/src/nodewatcher/nodewatcher.py b/node/nodewatcher/nodewatcher.py similarity index 87% rename from node/src/nodewatcher/nodewatcher.py rename to node/nodewatcher/nodewatcher.py index 848066f586..ea5b584a24 100755 --- a/node/src/nodewatcher/nodewatcher.py +++ b/node/nodewatcher/nodewatcher.py @@ -28,13 +28,14 @@ def getConfig(instance_id): print('running getConfig') config = ConfigParser.RawConfigParser() - config.read('nodewatcher.cfg') + config.read('/etc/nodewatcher.cfg') _region = config.get('nodewatcher', 'region') _scheduler = config.get('nodewatcher', 'scheduler') try: _asg = config.get('nodewatcher', 'asg') except ConfigParser.NoOptionError: - conn = boto.ec2.connect_to_region(_region) + conn = boto.ec2.connect_to_region(_region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) _asg = conn.get_all_instances(instance_ids=instance_id)[0].instances[0].tags['aws:autoscaling:groupName'] config.set('nodewatcher', 'asg', _asg) @@ -83,7 +84,7 @@ def getHostname(): def loadSchedulerModule(scheduler): print 'running loadSchedulerModule' - scheduler = 'plugins.' + scheduler + scheduler = 'nodewatcher.plugins.' 
+ scheduler _scheduler = __import__(scheduler) _scheduler = sys.modules[scheduler] @@ -97,13 +98,14 @@ def getJobs(s,hostname): def selfTerminate(asg): - _as_conn = boto.ec2.autoscale.connect_to_region(region) + _as_conn = boto.ec2.autoscale.connect_to_region(region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) _asg = _as_conn.get_all_groups(names=[asg])[0] _capacity = _asg.desired_capacity if _capacity > 0: _as_conn.terminate_instance(instance_id, decrement_capacity=True) -if __name__ == "__main__": +def main(): print('Running __main__') instance_id = getInstanceId() hostname = getHostname() @@ -125,3 +127,6 @@ def selfTerminate(asg): if hour_percentile > 95: selfTerminate(asg) + +if __name__ == "__main__": + main() diff --git a/node/src/nodewatcher/plugins/__init__.py b/node/nodewatcher/plugins/__init__.py similarity index 100% rename from node/src/nodewatcher/plugins/__init__.py rename to node/nodewatcher/plugins/__init__.py diff --git a/node/src/nodewatcher/plugins/openlava.py b/node/nodewatcher/plugins/openlava.py similarity index 100% rename from node/src/nodewatcher/plugins/openlava.py rename to node/nodewatcher/plugins/openlava.py diff --git a/node/src/nodewatcher/plugins/sge.py b/node/nodewatcher/plugins/sge.py similarity index 100% rename from node/src/nodewatcher/plugins/sge.py rename to node/nodewatcher/plugins/sge.py diff --git a/node/src/nodewatcher/plugins/test.py b/node/nodewatcher/plugins/test.py similarity index 100% rename from node/src/nodewatcher/plugins/test.py rename to node/nodewatcher/plugins/test.py diff --git a/node/src/nodewatcher/plugins/torque.py b/node/nodewatcher/plugins/torque.py similarity index 100% rename from node/src/nodewatcher/plugins/torque.py rename to node/nodewatcher/plugins/torque.py diff --git a/node/setup.py b/node/setup.py new file mode 100644 index 0000000000..568dd750f9 --- /dev/null +++ b/node/setup.py @@ -0,0 +1,56 @@ +# Copyright 2013-2014 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Amazon Software License (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/asl/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +import os, sys +from setuptools import setup, find_packages + +# Utility function to read the README file. +# Used for the long_description. It's nice, because now 1) we have a top level +# README file and 2) it's easier to type in the README file than to put a raw +# string in below ... +def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + +console_scripts = ['sqswatcher = sqswatcher.sqswatcher:main', + 'nodewatcher = nodewatcher.nodewatcher:main'] +version = "0.0.2" +requires = ['boto>=2.34', 'paramiko', 'python-dateutil'] + +if sys.version_info[:2] == (2, 6): + # For python2.6 we have to require argparse since it + # was not in stdlib until 2.7. 
+ requires.append('argparse>=1.1') + +setup( + name = "cfncluster-node", + version = version, + author = "Dougal Ballantyne", + author_email = "dougalb@amazon.com", + description = ("cfncluster-node provides the scripts for a cfncluster node."), + url = ("https://github.com/awslabs/cfncluster"), + license = "Amazon Software License", + packages = find_packages(), + install_requires = requires, + entry_points=dict(console_scripts=console_scripts), + include_package_data = True, + zip_safe = False, + package_data = { + '' : ['examples/config'], + }, + long_description=read('README'), + classifiers=[ + "Development Status :: 4 - Beta", + "Environment :: Console", + "Programming Language :: Python", + "Topic :: Scientific/Engineering", + "License :: Other/Proprietary License", + ], +) diff --git a/node/sqswatcher/__init__.py b/node/sqswatcher/__init__.py new file mode 100644 index 0000000000..717a83af1e --- /dev/null +++ b/node/sqswatcher/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2013-2014 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Amazon Software License (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/asl/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/node/src/sqswatcher/plugins/__init__.py b/node/sqswatcher/plugins/__init__.py similarity index 100% rename from node/src/sqswatcher/plugins/__init__.py rename to node/sqswatcher/plugins/__init__.py diff --git a/node/src/sqswatcher/plugins/openlava.py b/node/sqswatcher/plugins/openlava.py similarity index 100% rename from node/src/sqswatcher/plugins/openlava.py rename to node/sqswatcher/plugins/openlava.py diff --git a/node/src/sqswatcher/plugins/sge.py b/node/sqswatcher/plugins/sge.py similarity index 100% rename from node/src/sqswatcher/plugins/sge.py rename to node/sqswatcher/plugins/sge.py diff --git a/node/src/sqswatcher/plugins/test.py b/node/sqswatcher/plugins/test.py similarity index 100% rename from node/src/sqswatcher/plugins/test.py rename to node/sqswatcher/plugins/test.py diff --git a/node/src/sqswatcher/plugins/torque.py b/node/sqswatcher/plugins/torque.py similarity index 100% rename from node/src/sqswatcher/plugins/torque.py rename to node/sqswatcher/plugins/torque.py diff --git a/node/src/sqswatcher/sqswatcher.cfg b/node/sqswatcher/sqswatcher.cfg similarity index 100% rename from node/src/sqswatcher/sqswatcher.cfg rename to node/sqswatcher/sqswatcher.cfg diff --git a/node/src/sqswatcher/sqswatcher.py b/node/sqswatcher/sqswatcher.py similarity index 83% rename from node/src/sqswatcher/sqswatcher.py rename to node/sqswatcher/sqswatcher.py index c4c0815ea5..eff4b13522 100755 --- a/node/src/sqswatcher/sqswatcher.py +++ b/node/sqswatcher/sqswatcher.py @@ -32,7 +32,7 @@ def getConfig(): print('running getConfig') config = ConfigParser.RawConfigParser() - config.read('sqswatcher.cfg') + config.read('/etc/sqswatcher.cfg') _region = config.get('sqswatcher', 'region') _sqsqueue = config.get('sqswatcher', 'sqsqueue') _table_name = config.get('sqswatcher', 'table_name') @@ -45,7 +45,8 @@ def getConfig(): def setupQueue(region, sqsqueue): print('running setupQueue') - conn = boto.sqs.connect_to_region(region) + conn = 
boto.sqs.connect_to_region(region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) _q = conn.get_queue(sqsqueue) if _q != None: @@ -56,10 +57,12 @@ def setupQueue(region, sqsqueue): def setupDDBTable(region, table_name): print('running setupDDBTable') - conn = boto.dynamodb.connect_to_region(region) + conn = boto.dynamodb.connect_to_region(region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) tables = conn.list_tables() check = [t for t in tables if t == table_name] - conn = boto.dynamodb2.connect_to_region(region) + conn = boto.dynamodb2.connect_to_region(region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) if check: _table = Table(table_name,connection=conn) else: @@ -73,14 +76,14 @@ def setupDDBTable(region, table_name): def loadSchedulerModule(scheduler): print 'running loadSchedulerModule' - scheduler = 'plugins.' + scheduler + scheduler = 'sqswatcher.plugins.' + scheduler _scheduler = __import__(scheduler) _scheduler = sys.modules[scheduler] return _scheduler -def pollQueue(): +def pollQueue(scheduler, q, t): print 'running pollQueue' s = loadSchedulerModule(scheduler) @@ -105,7 +108,8 @@ def pollQueue(): print eventType, instanceId ec2 = boto.connect_ec2() - ec2 = boto.ec2.connect_to_region(region) + ec2 = boto.ec2.connect_to_region(region,proxy=boto.config.get('Boto', 'proxy'), + proxy_port=boto.config.get('Boto', 'proxy_port')) retry = 0 wait = 15 @@ -151,10 +155,14 @@ def pollQueue(): time.sleep(30) -if __name__ == "__main__": +def main(): print('running __main__') print time.ctime() + global region, cluster_user region, sqsqueue, table_name, scheduler, cluster_user = getConfig() q = setupQueue(region, sqsqueue) t = setupDDBTable(region, table_name) - pollQueue() \ No newline at end of file + pollQueue(scheduler, q, t) + +if __name__ == "__main__": + main()