diff --git a/codeguru_profiler_agent/agent_metadata/agent_debug_info.py b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py new file mode 100644 index 0000000..b3f8112 --- /dev/null +++ b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py @@ -0,0 +1,124 @@ +import logging +import os + +from codeguru_profiler_agent.utils.synchronization import synchronized +from codeguru_profiler_agent.utils.time import to_iso + +logger = logging.getLogger(__name__) + + +class ErrorsMetadata: + def __init__(self): + self.reset() + + def reset(self): + """ + We want to differentiate API call errors more granularly. We want to gather ResourceNotFoundException errors + because we are going to get this exception with auto-create feature and want to monitor how many times + the agent is not able to create the PG and resulting in subsequent ResourceNotFoundException. + """ + self.errors_count = 0 + self.sdk_client_errors = 0 + self.configure_agent_errors = 0 + self.configure_agent_rnfe_auto_create_enabled_errors = 0 + self.create_profiling_group_errors = 0 + self.post_agent_profile_errors = 0 + self.post_agent_profile_rnfe_auto_create_enabled_errors = 0 + + def serialize_to_json(self): + """ + This needs to be compliant with errors count schema. + """ + return { + "errorsCount": self.errors_count, + "sdkClientErrors": self.sdk_client_errors, + "configureAgentErrors": self.configure_agent_errors, + "configureAgentRnfeAutoCreateEnabledErrors": self.configure_agent_rnfe_auto_create_enabled_errors, + "createProfilingGroupErrors": self.create_profiling_group_errors, + "postAgentProfileErrors": self.post_agent_profile_errors, + "postAgentProfileRnfeAutoCreateEnabledErrors": self.post_agent_profile_rnfe_auto_create_enabled_errors + } + + @synchronized + def increment_sdk_error(self, error_type): + """ + ErrorsCount is the umbrella of all the kinds of error we want to capture. Currently we have only SdkClientErrors + in it. SdkClientErrors is comprised of different API level errors like ConfigureAgentErrors, + PostAgentProfileErrors, CreateProfilingGroupErrors. + :param error_type: The type of API level error that we want to capture. + """ + self.errors_count += 1 + self.sdk_client_errors += 1 + + """ + Special handling for ResourceNotFoundException errors. + For example configureAgentRnfeAutoCreateEnabledErrors is also a configureAgentErrors. + """ + if error_type == "configureAgentErrors": + self.configure_agent_errors += 1 + elif error_type == "configureAgentRnfeAutoCreateEnabledErrors": + self.configure_agent_errors += 1 + self.configure_agent_rnfe_auto_create_enabled_errors += 1 + elif error_type == "createProfilingGroupErrors": + self.create_profiling_group_errors += 1 + elif error_type == "postAgentProfileErrors": + self.post_agent_profile_errors += 1 + elif error_type == "postAgentProfileRnfeAutoCreateEnabledErrors": + self.post_agent_profile_errors += 1 + self.post_agent_profile_rnfe_auto_create_enabled_errors += 1 + + def record_sdk_error(self, error_type): + self.increment_sdk_error(error_type) + + +class AgentDebugInfo: + def __init__(self, errors_metadata=None, agent_start_time=None, timer=None): + self.process_id = get_process_id() + self.errors_metadata = errors_metadata + self.agent_start_time = agent_start_time + self.timer = timer + + def serialize_to_json(self): + """ + This needs to be compliant with agent debug info schema. + """ + json = {} + + self.add_agent_start_time(json) + self.add_process_id(json) + self.add_errors_metadata(json) + self.add_generic_metrics(json) + + return json + + def add_agent_start_time(self, json): + if self.agent_start_time is not None: + json["agentStartTime"] = to_iso(self.agent_start_time) + + def add_errors_metadata(self, json): + if self.errors_metadata is not None: + json["errorsCount"] = self.errors_metadata.serialize_to_json() + + def add_process_id(self, json): + if self.process_id is not None: + json["processId"] = self.process_id + + def add_generic_metrics(self, json): + if self.timer is not None and self.timer.metrics: + generic_metrics = {} + + for metric, metric_value in self.timer.metrics.items(): + generic_metrics[metric + "_timings_max"] = metric_value.max + generic_metrics[metric + "_timings_average"] = metric_value.average() + + if generic_metrics: + json["genericMetrics"] = generic_metrics + + +def get_process_id(): + try: + return os.getpid() + except Exception as e: + logger.info("Failed to get the process id", exc_info=True) + return None + diff --git a/codeguru_profiler_agent/local_aggregator.py b/codeguru_profiler_agent/local_aggregator.py index aa31ea6..1091c9b 100644 --- a/codeguru_profiler_agent/local_aggregator.py +++ b/codeguru_profiler_agent/local_aggregator.py @@ -2,6 +2,7 @@ import time import datetime +from codeguru_profiler_agent.agent_metadata.agent_debug_info import AgentDebugInfo from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration from codeguru_profiler_agent.metrics.with_timer import with_timer from codeguru_profiler_agent.model.profile import Profile @@ -30,6 +31,7 @@ def __init__(self, reporter, environment=dict()): :param host_weight: (required inside environment) scale factor used to rescale the profile collected in this host to make the profile representative of the whole fleet :param timer: (required inside environment) timer to be used for metrics + :param errors_metadata: (required inside environment) metadata capturing errors in the current profile. :param profile_factory: (inside environment) the factory to created profiler; default Profile. :param clock: (inside environment) clock to be used; default is time.time """ @@ -37,6 +39,7 @@ def __init__(self, reporter, environment=dict()): self.profiling_group_name = environment["profiling_group_name"] self.host_weight = environment["host_weight"] self.timer = environment["timer"] + self.errors_metadata = environment["errors_metadata"] self.profile_factory = environment.get("profile_factory") or Profile self.clock = environment.get("clock") or time.time @@ -44,6 +47,7 @@ def __init__(self, reporter, environment=dict()): self.profile = None self.memory_limit_bytes = environment["memory_limit_bytes"] self.last_report_attempted = current_milli_time(clock=self.clock) + self.agent_start_time = current_milli_time(clock=self.clock) self.reset() @@ -71,14 +75,16 @@ def _check_memory_limit(self): self.flush(force=True) def reset(self): + self.errors_metadata.reset() + self.timer.reset() self.profile = self.profile_factory( profiling_group_name=self.profiling_group_name, sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(), host_weight=self.host_weight, start=current_milli_time(clock=self.clock), + agent_debug_info=AgentDebugInfo(self.errors_metadata, self.agent_start_time, self.timer), clock=self.clock ) - self.timer.reset() @with_timer("flush") def flush(self, force=False, reset=True): diff --git a/codeguru_profiler_agent/model/profile.py b/codeguru_profiler_agent/model/profile.py index ba731c9..82f5cb3 100644 --- a/codeguru_profiler_agent/model/profile.py +++ b/codeguru_profiler_agent/model/profile.py @@ -11,7 +11,7 @@ class Profile: - def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, clock=time.time): + def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, agent_debug_info, clock=time.time): """ A profile holds the root node of the call graph and the metadata related to the profile """ @@ -35,6 +35,7 @@ def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, self.host_weight = int(host_weight) self._start_process_time = time.process_time() # provides process time in fractional seconds as float. self.overhead_ms = 0 + self.agent_debug_info = agent_debug_info @property def end(self): @@ -97,6 +98,9 @@ def _insert_stack(self, stack, runnable_count_increase=1): def get_memory_usage_bytes(self): return self.memory_counter.get_memory_usage_bytes() + def serialize_agent_debug_info_to_json(self): + return self.agent_debug_info.serialize_to_json() + def pause(self): if self.last_pause is not None: # pause gets called when profile is paused diff --git a/codeguru_profiler_agent/profiler.py b/codeguru_profiler_agent/profiler.py index 854f07c..5f7c8bb 100644 --- a/codeguru_profiler_agent/profiler.py +++ b/codeguru_profiler_agent/profiler.py @@ -7,6 +7,8 @@ from datetime import timedelta from random import SystemRandom from types import MappingProxyType as UnmodifiableDict + +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata from codeguru_profiler_agent.profiler_disabler import ProfilerDisabler from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration, AgentConfigurationMerger @@ -167,6 +169,7 @@ def _setup_final_environment(self, environment, environment_override): frozenset({environment['profiler_thread_name']}.union(environment['excluded_threads'])) # TODO delay metadata lookup until we need it environment['agent_metadata'] = environment.get('agent_metadata') or AgentMetadata() + environment['errors_metadata'] = environment.get('errors_metadata') or ErrorsMetadata() environment['collector'] = environment.get('collector') or self._select_collector(environment) environment["profiler_disabler"] = environment.get('profiler_disabler') or ProfilerDisabler(environment) return UnmodifiableDict(environment) diff --git a/codeguru_profiler_agent/sdk_reporter/profile_encoder.py b/codeguru_profiler_agent/sdk_reporter/profile_encoder.py index 315f3bf..dcc82d7 100644 --- a/codeguru_profiler_agent/sdk_reporter/profile_encoder.py +++ b/codeguru_profiler_agent/sdk_reporter/profile_encoder.py @@ -99,11 +99,15 @@ def encode_content(self): "start": int(self._profile.start), "end": int(self._profile.end), "agentMetadata": self._encode_agent_metadata(), - "callgraph": self._encode_call_graph(self._profile.callgraph) + "callgraph": self._encode_call_graph(self._profile.callgraph), + "debugInfo": self._encode_debug_info() } return json.dumps(profile_in_map) + def _encode_debug_info(self): + return self._profile.serialize_agent_debug_info_to_json() + def _encode_agent_metadata(self): profile_duration_seconds = self._profile.get_active_millis_since_start() / 1000.0 sample_weight = 1.0 if (profile_duration_seconds == 0) else self._profile.total_sample_count / profile_duration_seconds diff --git a/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py b/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py index 5df5e01..43e10cb 100644 --- a/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py +++ b/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py @@ -15,12 +15,14 @@ logger = logging.getLogger(__name__) AWS_EXECUTION_ENV_KEY = "AWS_EXECUTION_ENV" + class SdkReporter(Reporter): """ Handles communication with the CodeGuru Profiler Service backend. Encodes profiles using the ProfilerEncoder and reports them using the CodeGuru profiler SDK. """ is_create_pg_called_during_submit_profile = False + def __init__(self, environment): """ :param environment: dependency container dictionary for the current profiler. @@ -35,6 +37,7 @@ def __init__(self, environment): self.timer = environment.get("timer") self.metadata = environment["agent_metadata"] self.agent_config_merger = environment["agent_config_merger"] + self.errors_metadata = environment["errors_metadata"] def _encode_profile(self, profile): output_profile_stream = io.BytesIO() @@ -76,10 +79,12 @@ def refresh_configuration(self): # We handle service exceptions like this in boto3 # see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html if error.response['Error']['Code'] == 'ValidationException': + self.errors_metadata.record_sdk_error("configureAgentErrors") self.agent_config_merger.disable_profiling() self._log_request_failed(operation="configure_agent", exception=error) - if error.response['Error']['Code'] == 'ResourceNotFoundException': + elif error.response['Error']['Code'] == 'ResourceNotFoundException': if self.should_auto_create_profiling_group(): + self.errors_metadata.record_sdk_error("configureAgentRnfeAutoCreateEnabledErrors") logger.info( "Profiling group not found. Will try to create a profiling group " "with name = {} and compute platform = {} and retry calling configure agent after 5 minutes. " @@ -87,7 +92,10 @@ def refresh_configuration(self): .format(self.profiling_group_name, 'AWSLambda')) self.create_profiling_group() else: + self.errors_metadata.record_sdk_error("configureAgentErrors") self.agent_config_merger.disable_profiling() + else: + self.errors_metadata.record_sdk_error("configureAgentErrors") except Exception as e: self._log_request_failed(operation="configure_agent", exception=e) @@ -117,12 +125,17 @@ def report(self, profile): if error.response['Error']['Code'] == 'ResourceNotFoundException': if self.should_auto_create_profiling_group(): self.__class__.is_create_pg_called_during_submit_profile = True + self.errors_metadata.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") logger.info( "Profiling group not found. Will try to create a profiling group " "with name = {} and compute platform = {} and retry reporting during next invocation. " "Make sure that Lambda's execution role has AmazonCodeGuruProfilerAgentAccess policy added." .format(self.profiling_group_name, 'AWSLambda')) self.create_profiling_group() + else: + self.errors_metadata.record_sdk_error("postAgentProfileErrors") + else: + self.errors_metadata.record_sdk_error("postAgentProfileErrors") return False except Exception as e: self._log_request_failed(operation="post_agent_profile", exception=e) @@ -143,6 +156,8 @@ def create_profiling_group(self): if error.response['Error']['Code'] == 'ConflictException': logger.info("Profiling Group with name {} already exists. Please use a different name." .format(self.profiling_group_name)) + else: + self.errors_metadata.record_sdk_error("createProfilingGroupErrors") except Exception as e: self._log_request_failed(operation="create_profiling_group", exception=e) diff --git a/codeguru_profiler_agent/utils/synchronization.py b/codeguru_profiler_agent/utils/synchronization.py new file mode 100644 index 0000000..37460be --- /dev/null +++ b/codeguru_profiler_agent/utils/synchronization.py @@ -0,0 +1,15 @@ +import functools +import threading + + +def synchronized(wrapped): + """The missing @synchronized decorator + + https://git.io/vydTA""" + _lock = threading.RLock() + + @functools.wraps(wrapped) + def _wrapper(*args, **kwargs): + with _lock: + return wrapped(*args, **kwargs) + return _wrapper diff --git a/codeguru_profiler_agent/utils/time.py b/codeguru_profiler_agent/utils/time.py index 5ebb3ec..a6f4095 100644 --- a/codeguru_profiler_agent/utils/time.py +++ b/codeguru_profiler_agent/utils/time.py @@ -3,11 +3,14 @@ import time from datetime import datetime + def to_iso(epoch_milli): try: - return datetime.fromtimestamp(epoch_milli / 1000).isoformat() + return datetime.utcfromtimestamp(epoch_milli / 1000).isoformat( + timespec='milliseconds') + "Z" # ISO 8601 date-time format except ValueError: return str(epoch_milli) + def current_milli_time(clock=time.time): return int(clock() * 1000) diff --git a/test/integration/test_live_backend_reporting.py b/test/integration/test_live_backend_reporting.py index a79f231..832bbc1 100644 --- a/test/integration/test_live_backend_reporting.py +++ b/test/integration/test_live_backend_reporting.py @@ -5,6 +5,7 @@ from datetime import timedelta +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo from test.help_utils import MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS from test.pytestutils import before @@ -32,8 +33,10 @@ def before(self): stacks=[[Frame(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS)]], attempted_sample_threads_count=1, seen_threads_count=1) + errors_metadata = ErrorsMetadata() - self.profile = Profile(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, 1.0, 1.0, five_minutes_ago_millis) + self.profile = Profile(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, 1.0, 1.0, five_minutes_ago_millis, + AgentDebugInfo(errors_metadata)) # FIXME: Remove adding the end time manually below after feature fully support self.profile.end = now_millis self.profile.add(sample) @@ -47,7 +50,8 @@ def before(self): "minimum_time_reporting": timedelta(minutes=6), "max_stack_depth": 2345, "cpu_limit_percentage": 29, - "agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()) + "agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()), + "errors_metadata": errors_metadata } self.environment["codeguru_profiler_builder"] = CodeGuruClientBuilder(self.environment) self.agent_config = AgentConfiguration( diff --git a/test/unit/agent_metadata/test_agent_debug_info.py b/test/unit/agent_metadata/test_agent_debug_info.py new file mode 100644 index 0000000..27a748f --- /dev/null +++ b/test/unit/agent_metadata/test_agent_debug_info.py @@ -0,0 +1,151 @@ +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo +from codeguru_profiler_agent.metrics.timer import Timer + + +class TestAgentDebugInfo: + def test_it_returns_json_with_error_counts(self): + errors_metadata = ErrorsMetadata() + errors_metadata.record_sdk_error("createProfilingGroupErrors") + subject = AgentDebugInfo(errors_metadata) + + serialized_json = subject.serialize_to_json() + assert serialized_json["errorsCount"] == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 1, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_it_returns_json_with_agent_start_time(self): + subject = AgentDebugInfo(agent_start_time=1577840400000) # 2020-01-01T01:00:00Z UTC + + serialized_json = subject.serialize_to_json() + assert serialized_json["agentStartTime"] == "2020-01-01T01:00:00.000Z" + + def test_it_returns_json_with_generic_metrics(self): + timer = Timer() + timer.record("metric1", 12345000) + timer.record("metric1", 12350000) + subject = AgentDebugInfo(timer=timer) + + serialized_json = subject.serialize_to_json() + assert serialized_json["genericMetrics"] == { + "metric1_timings_max": 12350000, + "metric1_timings_average": 12347500.0 + } + + +class TestErrorsMetadata: + class TestSerializeToJson: + def test_it_returns_json_with_error_counts(self): + subject = ErrorsMetadata() + subject.record_sdk_error("configureAgentErrors") + + assert subject.serialize_to_json() == { + "errorsCount": 1, + "sdkClientErrors": 1, + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0 + } + + class TestRecordSdkError: + def test_it_increments_error_count_when_configureAgentErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("configureAgentErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 1 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_configureAgentRnfeAutoCreateEnabledErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("configureAgentRnfeAutoCreateEnabledErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 1 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 1 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_createProfilingGroupErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("createProfilingGroupErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 1 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_postAgentProfileErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("postAgentProfileErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 1 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_postAgentProfileRnfeAutoCreateEnabledErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 1 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 1 + + def test_it_increments_error_count_when_multiple_errors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("createProfilingGroupErrors") + subject.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") + + assert subject.errors_count == 2 + assert subject.sdk_client_errors == 2 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 1 + assert subject.post_agent_profile_errors == 1 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 1 + + def test_it_resets_error_count(self): + subject = ErrorsMetadata() + subject.record_sdk_error("createProfilingGroupErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 1 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + subject.reset() + + assert subject.errors_count == 0 + assert subject.sdk_client_errors == 0 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 diff --git a/test/unit/model/test_profile.py b/test/unit/model/test_profile.py index 667b038..07c0537 100644 --- a/test/unit/model/test_profile.py +++ b/test/unit/model/test_profile.py @@ -1,6 +1,7 @@ import pytest from unittest.mock import Mock +from codeguru_profiler_agent.agent_metadata.agent_debug_info import AgentDebugInfo, ErrorsMetadata from codeguru_profiler_agent.model.frame import Frame from test.pytestutils import before import datetime @@ -19,6 +20,7 @@ def before(self): sampling_interval_seconds=1.0, host_weight=2, start=self.test_start_time, + agent_debug_info=AgentDebugInfo(ErrorsMetadata()), clock=self.mock_clock ) @@ -292,11 +294,14 @@ def test_it_stores_valid_start_time(self): def test_it_raise_exception_for_invalid_start_time(self): with pytest.raises(ValueError): - Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=0) + Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=0, + agent_debug_info=AgentDebugInfo(ErrorsMetadata)) with pytest.raises(ValueError): - Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=-100) + Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=-100, + agent_debug_info=AgentDebugInfo(ErrorsMetadata)) with pytest.raises(TypeError): - Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=None) + Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=None, + agent_debug_info=AgentDebugInfo(ErrorsMetadata)) class TestEndTime(TestProfile): diff --git a/test/unit/sdk_reporter/test_sdk_profile_encoder.py b/test/unit/sdk_reporter/test_sdk_profile_encoder.py index f3092f2..1607b0f 100644 --- a/test/unit/sdk_reporter/test_sdk_profile_encoder.py +++ b/test/unit/sdk_reporter/test_sdk_profile_encoder.py @@ -4,6 +4,7 @@ import pytest from unittest.mock import MagicMock +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata from codeguru_profiler_agent.agent_metadata.aws_ec2_instance import AWSEC2Instance from codeguru_profiler_agent.model.frame import Frame @@ -23,7 +24,7 @@ def example_profile(): start_time = 1514764800000 end_time = 1514772000000 profile = Profile(profiling_group_name="TestProfilingGroupName", sampling_interval_seconds=1.0, host_weight=2, - start=start_time, clock=lambda: 1514772000000) + start=start_time, agent_debug_info=AgentDebugInfo(errors_metadata), clock=lambda: 1514772000000) profile.add( Sample(stacks=[[Frame("bottom"), Frame("middle"), Frame("top")], [Frame("bottom"), Frame("middle"), Frame("different_top")], @@ -40,10 +41,12 @@ def example_profile(): agent_metadata = AgentMetadata( fleet_info=AWSEC2Instance(host_name="testHostName", host_type="testHostType") ) +errors_metadata = ErrorsMetadata() environment = { "timer": Timer(), - "agent_metadata": agent_metadata + "agent_metadata": agent_metadata, + "errors_metadata": errors_metadata } diff --git a/test/unit/sdk_reporter/test_sdk_reporter.py b/test/unit/sdk_reporter/test_sdk_reporter.py index 287894d..c0d71c9 100644 --- a/test/unit/sdk_reporter/test_sdk_reporter.py +++ b/test/unit/sdk_reporter/test_sdk_reporter.py @@ -5,6 +5,7 @@ from datetime import timedelta, datetime +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo from codeguru_profiler_agent.utils.time import current_milli_time from test.pytestutils import before from unittest.mock import MagicMock @@ -22,7 +23,32 @@ profiling_group_name = "test-ProfilingGroup-name" autocreated_test_lambda_profiling_group_name = "aws-lambda-testLambdaName" -profile = Profile(profiling_group_name, 1.0, 0.5, current_milli_time()) +errors_metadata = ErrorsMetadata() +profile = Profile(profiling_group_name, 1.0, 0.5, current_milli_time(), AgentDebugInfo(errors_metadata)) +expected_response_create_pg = { + 'profilingGroup': { + 'agentOrchestrationConfig': { + 'profilingEnabled': True + }, + 'arn': 'string', + 'computePlatform': 'AWSLambda', + 'createdAt': datetime(2015, 1, 1), + 'name': 'string', + 'profilingStatus': { + 'latestAgentOrchestratedAt': datetime(2015, 1, 1), + 'latestAgentProfileReportedAt': datetime(2015, 1, 1), + 'latestAggregatedProfile': { + 'period': 'PT5M', + 'start': datetime(2015, 1, 1) + } + }, + 'tags': { + 'string': 'string' + }, + 'updatedAt': datetime(2015, 1, 1) + } +} + class TestSdkReporter: def before(self): @@ -42,6 +68,7 @@ def before(self): "profile_encoder": profile_encoder, "codeguru_profiler_builder": codeguru_client_builder, "agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()), + "errors_metadata": errors_metadata, "reporting_interval": timedelta(minutes=13), "sampling_interval": timedelta(seconds=13), "minimum_time_reporting": timedelta(minutes=13), @@ -55,6 +82,7 @@ def before(self): self.environment["agent_config_merger"] = AgentConfigurationMerger(default_config) self.subject = SdkReporter(environment=self.environment) self.subject.setup() + self.subject.errors_metadata.reset() def clear_lambda_specific_environment_variables_for_test_run(self): keys_to_delete = [LAMBDA_TASK_ROOT, LAMBDA_RUNTIME_DIR, LAMBDA_EXECUTION_ENV, LAMBDA_MEMORY_SIZE_ENV] @@ -62,6 +90,7 @@ def clear_lambda_specific_environment_variables_for_test_run(self): if key in os.environ: os.environ.__delitem__(key) + class TestReport(TestSdkReporter): @before def before(self): @@ -84,6 +113,56 @@ def test_return_false_when_report_throws_error(self): with self.client_stubber: assert self.subject.report(profile) is False + def test_errors_metadata_when_post_agent_profile_error(self): + self.client_stubber.add_client_error('post_agent_profile', service_error_code="InternalFailure", + service_message='Simulated error in post_agent_profile call') + with self.client_stubber: + assert self.subject.report(profile) is False + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 1, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_post_agent_profile_rnfe_error(self): + self.client_stubber.add_client_error('post_agent_profile', service_error_code="ResourceNotFoundException", + service_message='Simulated error in post_agent_profile call') + with self.client_stubber: + assert self.subject.report(profile) is False + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 1, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_post_agent_profile_rnfe_error_auto_create_enabled(self): + self.client_stubber.add_client_error('post_agent_profile', service_error_code="ResourceNotFoundException", + service_message='Simulated error in post_agent_profile call') + self.client_stubber.add_response('create_profiling_group', expected_response_create_pg) + + os.environ.__setitem__(LAMBDA_TASK_ROOT, 'test-task-root') + os.environ.__setitem__(LAMBDA_RUNTIME_DIR, 'test-dir') + + with self.client_stubber: + assert self.subject.report(profile) is False + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 1, + "postAgentProfileRnfeAutoCreateEnabledErrors": 1, + "sdkClientErrors": 1 + } + def test_create_profiling_group_called_when_pg_does_not_exist_lambda_case(self): expected_params = { 'agentProfile': ANY, @@ -96,29 +175,6 @@ def test_create_profiling_group_called_when_pg_does_not_exist_lambda_case(self): 'post_agent_profile call', expected_params=expected_params) - expected_response_create_pg = { - 'profilingGroup': { - 'agentOrchestrationConfig': { - 'profilingEnabled': True - }, - 'arn': 'string', - 'computePlatform': 'AWSLambda', - 'createdAt': datetime(2015, 1, 1), - 'name': 'string', - 'profilingStatus': { - 'latestAgentOrchestratedAt': datetime(2015, 1, 1), - 'latestAgentProfileReportedAt': datetime(2015, 1, 1), - 'latestAggregatedProfile': { - 'period': 'PT5M', - 'start': datetime(2015, 1, 1) - } - }, - 'tags': { - 'string': 'string' - }, - 'updatedAt': datetime(2015, 1, 1) - } - } self.client_stubber.add_response('create_profiling_group', expected_response_create_pg) expected_params_post_agent_profile = { @@ -184,6 +240,71 @@ def test_agent_configuration_when_configure_agent_throws_error(self): assert AgentConfiguration.get().should_profile is True assert AgentConfiguration.get().sampling_interval == timedelta(seconds=13) + def test_errors_metadata_when_configure_agent_error(self): + self.client_stubber.add_client_error('configure_agent', service_error_code="InternalFailure", + service_message='Simulated error in configure_agent call') + with self.client_stubber: + self.subject.refresh_configuration() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_configure_agent_validation_exception_error(self): + self.client_stubber.add_client_error('configure_agent', service_error_code="ValidationException", + service_message='Simulated error in configure_agent call') + with self.client_stubber: + self.subject.refresh_configuration() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_configure_agent_rnfe_error(self): + self.client_stubber.add_client_error('configure_agent', service_error_code="ResourceNotFoundException", + service_message='Simulated error in configure_agent call') + with self.client_stubber: + self.subject.refresh_configuration() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_configure_agent_rnfe_error_auto_create_enabled(self): + self.client_stubber.add_client_error('configure_agent', service_error_code="ResourceNotFoundException", + service_message='Simulated error in configure_agent call') + self.client_stubber.add_response('create_profiling_group', expected_response_create_pg) + + os.environ.__setitem__(LAMBDA_TASK_ROOT, 'test-task-root') + os.environ.__setitem__(LAMBDA_RUNTIME_DIR, 'test-dir') + + with self.client_stubber: + self.subject.refresh_configuration() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 1, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + def test_when_backends_sends_resource_not_found_it_stops_the_profiling_in_non_lambda_case(self): self.client_stubber.add_client_error('configure_agent', service_error_code='ResourceNotFoundException', service_message='Simulated error in configure_agent call') @@ -213,29 +334,6 @@ def test_create_profiling_group_called_when_pg_does_not_exist_in_lambda_case(sel service_message='Simulated ResourceNotFoundException in ' 'configure_agent call') - expected_response_create_pg = { - 'profilingGroup': { - 'agentOrchestrationConfig': { - 'profilingEnabled': True - }, - 'arn': 'string', - 'computePlatform': 'AWSLambda', - 'createdAt': datetime(2015, 1, 1), - 'name': 'string', - 'profilingStatus': { - 'latestAgentOrchestratedAt': datetime(2015, 1, 1), - 'latestAgentProfileReportedAt': datetime(2015, 1, 1), - 'latestAggregatedProfile': { - 'period': 'PT5M', - 'start': datetime(2015, 1, 1) - } - }, - 'tags': { - 'string': 'string' - }, - 'updatedAt': datetime(2015, 1, 1) - } - } self.client_stubber.add_response('create_profiling_group', expected_response_create_pg) os.environ.__setitem__(LAMBDA_MEMORY_SIZE_ENV, "512") @@ -256,3 +354,24 @@ def test_create_pg_not_invoked_in_non_lambda_case(self): with self.client_stubber: self.subject.refresh_configuration() self.client_stubber.assert_no_pending_responses() + + +class TestCreateProfilingGroup(TestSdkReporter): + @before + def before(self): + super().before() + + def test_errors_metadata_when_create_profiling_group_error(self): + self.client_stubber.add_client_error('create_profiling_group', service_error_code="InternalFailure", + service_message='Simulated error in create_profiling_group call') + with self.client_stubber: + self.subject.create_profiling_group() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 1, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } diff --git a/test/unit/test_local_aggregator.py b/test/unit/test_local_aggregator.py index 604a3ae..ee11ddf 100644 --- a/test/unit/test_local_aggregator.py +++ b/test/unit/test_local_aggregator.py @@ -2,6 +2,9 @@ from datetime import timedelta +from botocore.stub import ANY + +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration from codeguru_profiler_agent.utils.time import current_milli_time from test.pytestutils import before @@ -22,6 +25,8 @@ TEST_PROFILING_GROUP_NAME = "test-application" TEST_SAMPLING_INTERVAL = 0.5 TEST_HOST_WEIGHT = 1.0 +ERRORS_METADATA = MagicMock() +AGENT_DEBUG_INFO = MagicMock() ONE_SECOND = timedelta(seconds=1) @@ -35,7 +40,8 @@ def mock_timer(): def assert_profile_is_reset(profile_factory, clock): profile_factory.assert_called_once_with( profiling_group_name=TEST_PROFILING_GROUP_NAME, host_weight=TEST_HOST_WEIGHT, - sampling_interval_seconds=TEST_SAMPLING_INTERVAL, start=current_milli_time(clock), clock=clock) + sampling_interval_seconds=TEST_SAMPLING_INTERVAL, start=current_milli_time(clock), + agent_debug_info=ANY, clock=clock) class TestLocalAggregator: @@ -57,6 +63,7 @@ def before(self): "sampling_interval": timedelta(seconds=TEST_SAMPLING_INTERVAL), "host_weight": TEST_HOST_WEIGHT, "profile_factory": self.mock_profile_factory, + "errors_metadata": ERRORS_METADATA, "memory_limit_bytes": DEFAULT_MEMORY_LIMIT_BYTES, "clock": self.clock, "timer": self.timer, @@ -78,9 +85,9 @@ def before(self): ) ) - assert len(self.environment.keys()) == 7 + assert len(self.environment.keys()) == 8 self.profiler = Profiler(profiling_group_name=TEST_PROFILING_GROUP_NAME, environment_override=self.environment) - assert len(self.environment.keys()) == 6 + assert len(self.environment.keys()) == 7 self.subject = LocalAggregator(**self.configuration) self.mock_profile_factory.reset_mock()