Skip to content

Commit

Permalink
Merge pull request #1930 from Yelp/u/ddelnano/have-check_registered_s…
Browse files Browse the repository at this point in the history
…laves_aws-ignore-recently-created-asgs-sfrs

Update check_registered_slaves_aws.py ignore recently created asgs and sfrs
  • Loading branch information
ddelnano committed Aug 17, 2018
2 parents 62987e8 + a601120 commit 0b42232
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 25 deletions.
35 changes: 32 additions & 3 deletions paasta_tools/autoscaling/autoscaling_cluster_lib.py
Expand Up @@ -21,6 +21,7 @@
from collections import namedtuple
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from math import ceil
from math import floor
from typing import Any
Expand Down Expand Up @@ -149,18 +150,19 @@ def __init__(
self.enable_maintenance_reservation = enable_maintenance_reservation

self.log.info('Initialized with utilization error %s' % self.utilization_error)
self.setup_metrics()
config = load_system_paasta_config()
self.slave_newness_threshold = config.get_monitoring_config().get('check_registered_slave_threshold')
self.setup_metrics(config)

@property
def log(self) -> logging.Logger:
resource_id = self.resource.get("id", "unknown")
name = '.'.join([__name__, self.__class__.__name__, resource_id])
return logging.getLogger(name)

def setup_metrics(self) -> None:
def setup_metrics(self, config: SystemPaastaConfig) -> None:
if not self.enable_metrics:
return None
config = load_system_paasta_config()
dims = {
'paasta_cluster': config.get_cluster(),
'region': self.resource.get('region', 'unknown'),
Expand Down Expand Up @@ -237,6 +239,9 @@ def describe_instances(
instances = [instance for reservation in instance_reservations for instance in reservation]
return instances

def is_new_autoscaling_resource(self) -> bool:
raise NotImplementedError()

def describe_instance_status(
self,
instance_ids: List[str],
Expand Down Expand Up @@ -800,6 +805,18 @@ def get_sfr(
ret = sfrs['SpotFleetRequestConfigs'][0]
return ret

def is_new_autoscaling_resource(self) -> bool:
"""
Determines if the spot fleet request was created recently as defined by
CHECK_REGISTERED_SLAVE_THRESHOLD.
"""
if not self.sfr:
self.log.warn('Cannot find SFR {}'.format(self.resource['id']))
return True

now = datetime.now(timezone.utc)
return (now - self.sfr['CreateTime']).total_seconds() < self.slave_newness_threshold

def get_spot_fleet_instances(
self,
spotfleet_request_id: str,
Expand Down Expand Up @@ -971,6 +988,18 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
def exists(self) -> bool:
return True if self.asg else False

def is_new_autoscaling_resource(self) -> bool:
"""
Determines if an autoscaling group was created recently as defined by
CHECK_REGISTERED_SLAVE_THRESHOLD.
"""
if not self.asg:
self.log.warning("ASG {} not found, removing config file".format(self.resource['id']))
return True

now = datetime.now(timezone.utc)
return (now - self.asg['CreatedTime']).total_seconds() < self.slave_newness_threshold

def get_asg(self, asg_name: str, region: Optional[str]=None) -> Optional[Dict[str, Any]]:
asg_client = boto3.client('autoscaling', region_name=region)
asgs = asg_client.describe_auto_scaling_groups(AutoScalingGroupNames=[asg_name])
Expand Down
11 changes: 10 additions & 1 deletion paasta_tools/contrib/check_registered_slaves_aws.py
Expand Up @@ -17,7 +17,8 @@ def check_registration(threshold_percentage):
print("Could not find Mesos Master: %s" % e.message)
sys.exit(1)

autoscaling_resources = load_system_paasta_config().get_cluster_autoscaling_resources()
config = load_system_paasta_config()
autoscaling_resources = config.get_cluster_autoscaling_resources()
for resource in autoscaling_resources.values():
print("Checking %s" % resource['id'])
try:
Expand All @@ -36,6 +37,14 @@ def check_registration(threshold_percentage):
if len(scaler.instances) == 0:
print("No instances for this resource")
continue
elif scaler.is_new_autoscaling_resource():
# See OPS-13784
threshold = config.get_monitoring_config().get('check_registered_slave_threshold')
print(
f"Autoscaling resource was created within last {threshold}"
" seconds and would probably fail this check",
)
continue
else:
slaves = scaler.get_aws_slaves(mesos_state)
percent_registered = float(float(len(slaves)) / float(len(scaler.instances))) * 100
Expand Down
105 changes: 84 additions & 21 deletions tests/autoscaling/test_autoscaling_cluster_lib.py
Expand Up @@ -15,6 +15,9 @@
import contextlib
import unittest
import warnings
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from math import ceil
from math import floor

Expand All @@ -28,6 +31,7 @@
from paasta_tools.autoscaling import autoscaling_cluster_lib
from paasta_tools.mesos_tools import SlaveTaskCount
from paasta_tools.metrics.metastatus_lib import ResourceInfo
from paasta_tools.utils import SystemPaastaConfig
from paasta_tools.utils import TimeoutError


Expand Down Expand Up @@ -495,22 +499,29 @@ def setUp(self):
self.autoscaler = self.create_autoscaler()

def create_autoscaler(self, utilization_error=0.3, resource=None, asg=None):
config = SystemPaastaConfig({'monitoring_config': {'check_registered_slave_threshold': 3600}}, '/etc/paasta')
with mock.patch(
'paasta_tools.autoscaling.autoscaling_cluster_lib.AsgAutoscaler.get_asg',
autospec=True,
return_value=asg or {},
):
autoscaler = autoscaling_cluster_lib.AsgAutoscaler(
resource=resource or self.mock_resource,
pool_settings=self.mock_pool_settings,
config_folder=self.mock_config_folder,
dry_run=False,
utilization_error=utilization_error,
max_increase=0.2,
max_decrease=0.1,
)
autoscaler.instances = []
return autoscaler
with mock.patch(
'paasta_tools.autoscaling.autoscaling_cluster_lib.load_system_paasta_config',
autospec=True,
return_value=config,
):
print(config.get_monitoring_config())
autoscaler = autoscaling_cluster_lib.AsgAutoscaler(
resource=resource or self.mock_resource,
pool_settings=self.mock_pool_settings,
config_folder=self.mock_config_folder,
dry_run=False,
utilization_error=utilization_error,
max_increase=0.2,
max_decrease=0.1,
)
autoscaler.instances = []
return autoscaler

def create_mock_resource(self, **kwargs):
mock_resource = self.mock_resource.copy()
Expand All @@ -535,6 +546,24 @@ def test_is_asg_cancelled(self):
self.autoscaler.asg = mock.Mock()
assert not self.autoscaler.is_resource_cancelled()

def test_is_new_autoscaling_resource_when_asg_is_above_threshold(self):
asg = {
'Instances': [mock.Mock()],
'CreatedTime': datetime.now(timezone.utc) - timedelta(
seconds=3600 + 60,
),
}
autoscaler = self.create_autoscaler(asg=asg)
assert not autoscaler.is_new_autoscaling_resource()

def test_is_new_autoscaling_resource_when_asg_is_below_threshold(self):
asg = {
'Instances': [mock.Mock()],
'CreatedTime': datetime.now(timezone.utc),
}
autoscaler = self.create_autoscaler(asg=asg)
assert autoscaler.is_new_autoscaling_resource()

def test_get_asg(self):
with mock.patch('boto3.client', autospec=True) as mock_ec2_client:
mock_asg = mock.Mock()
Expand Down Expand Up @@ -743,16 +772,25 @@ def create_autoscaler(self, utilization_error=0.3, resource=None, sfr=None):
) as mock_get_spot_fleet_instances:
mock_get_sfr.return_value = sfr or {}
mock_get_spot_fleet_instances.return_value = []

return autoscaling_cluster_lib.SpotAutoscaler(
resource=resource or self.mock_resource,
pool_settings=self.mock_pool_settings,
config_folder=self.mock_config_folder,
dry_run=False,
utilization_error=utilization_error,
max_increase=0.2,
max_decrease=0.1,
)
with mock.patch(
'paasta_tools.autoscaling.autoscaling_cluster_lib.load_system_paasta_config',
autospec=True,
return_value=SystemPaastaConfig(
{
'monitoring_config': {'check_registered_slave_threshold': 3600},
}, '/etc/paasta',
),
):

return autoscaling_cluster_lib.SpotAutoscaler(
resource=resource or self.mock_resource,
pool_settings=self.mock_pool_settings,
config_folder=self.mock_config_folder,
dry_run=False,
utilization_error=utilization_error,
max_increase=0.2,
max_decrease=0.1,
)

def create_mock_resource(self, **kwargs):
mock_resource = self.mock_resource.copy()
Expand Down Expand Up @@ -854,6 +892,28 @@ def test_get_sfr(self):
ret = self.autoscaler.get_sfr('sfr-blah', region='westeros-1')
assert ret is None

def test_is_new_autoscaling_resource_when_sfr_is_above_threshold(self):
sfr = {
'SpotFleetRequestConfig': {'FulfilledCapacity': 2},
'SpotFleetRequestState': 'active',
'Instances': [mock.Mock()],
'CreateTime': datetime.now(timezone.utc) - timedelta(
seconds=3600 + 60,
),
}
autoscaler = self.create_autoscaler(sfr=sfr)
assert not autoscaler.is_new_autoscaling_resource()

def test_is_new_autoscaling_resource_when_sfr_is_below_threshold(self):
sfr = {
'SpotFleetRequestConfig': {'FulfilledCapacity': 2},
'SpotFleetRequestState': 'active',
'Instances': [mock.Mock()],
'CreateTime': datetime.now(timezone.utc),
}
autoscaler = self.create_autoscaler(sfr=sfr)
assert autoscaler.is_new_autoscaling_resource()

def test_set_spot_fleet_request_capacity(self):
with mock.patch(
'boto3.client', autospec=True,
Expand Down Expand Up @@ -1166,6 +1226,9 @@ def test_emit_metrics(self):
self.autoscaler.aws_instances_gauge.set.assert_called_once_with(0)
self.autoscaler.mesos_slaves_gauge.set.assert_called_once_with(5)

def test_is_new_autoscaling_resource(self):
self.assertRaises(NotImplementedError, self.autoscaler.is_new_autoscaling_resource)

def test_describe_instance(self):
with mock.patch('boto3.client', autospec=True) as mock_ec2_client:
mock_instance_1 = mock.Mock()
Expand Down

0 comments on commit 0b42232

Please sign in to comment.