From a659e33bd683292b456450a1345ba0f8f861edfb Mon Sep 17 00:00:00 2001 From: Parag Panda Date: Wed, 21 Apr 2021 19:19:14 -0700 Subject: [PATCH 1/6] Add errors metadata in agent debug info with granular sdk client error metrics Also, adding metrics which tells us how many of RFN exception will try to create profiling group. ErrorsMetadata is part of AgentDebugInfo which is part of the Profile. So, everytime we submit a profile we submit these metrics along with it. SdkClientErrors will tell us total failures happening because of API calls. Individual API call failures are captured with the same name as APIs and the rfne resulting in auto creation of PG is also captured. --- .../agent_metadata/agent_debug_info.py | 70 ++++++ codeguru_profiler_agent/local_aggregator.py | 5 + codeguru_profiler_agent/model/profile.py | 6 +- codeguru_profiler_agent/profiler.py | 3 + .../sdk_reporter/profile_encoder.py | 6 +- .../sdk_reporter/sdk_reporter.py | 17 +- .../utils/synchronization.py | 15 ++ .../test_live_backend_reporting.py | 8 +- .../agent_metadata/test_agent_debug_info.py | 135 +++++++++++ test/unit/model/test_profile.py | 11 +- .../sdk_reporter/test_sdk_profile_encoder.py | 7 +- test/unit/sdk_reporter/test_sdk_reporter.py | 213 ++++++++++++++---- test/unit/test_local_aggregator.py | 13 +- 13 files changed, 449 insertions(+), 60 deletions(-) create mode 100644 codeguru_profiler_agent/agent_metadata/agent_debug_info.py create mode 100644 codeguru_profiler_agent/utils/synchronization.py create mode 100644 test/unit/agent_metadata/test_agent_debug_info.py diff --git a/codeguru_profiler_agent/agent_metadata/agent_debug_info.py b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py new file mode 100644 index 0000000..e4311db --- /dev/null +++ b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py @@ -0,0 +1,70 @@ +from codeguru_profiler_agent.utils.synchronization import synchronized + + +class ErrorsMetadata: + def __init__(self): + self.errors_count = 0 + self.sdk_client_errors = 0 + self.configure_agent_errors = 0 + self.configure_agent_rnfe_auto_create_enabled_errors = 0 + self.create_profiling_group_errors = 0 + self.post_agent_profile_errors = 0 + self.post_agent_profile_rnfe_auto_create_enabled_errors = 0 + + def reset(self): + self.errors_count = 0 + self.sdk_client_errors = 0 + self.configure_agent_errors = 0 + self.configure_agent_rnfe_auto_create_enabled_errors = 0 + self.create_profiling_group_errors = 0 + self.post_agent_profile_errors = 0 + self.post_agent_profile_rnfe_auto_create_enabled_errors = 0 + + """ + This needs to be compliant with errors count schema. + https://code.amazon.com/packages/SkySailProfileIonSchema/blobs/811cc0e7e406e37a5b878acf31468be3dcd2963d/--/src/main/resources/schema/DebugInfo.isl#L21 + """ + + def serialize_to_json(self): + return { + "errorsCount": self.errors_count, + "sdkClientErrors": self.sdk_client_errors, + "configureAgentErrors": self.configure_agent_errors, + "configureAgentRnfeAutoCreateEnabledErrors": self.configure_agent_rnfe_auto_create_enabled_errors, + "createProfilingGroupErrors": self.create_profiling_group_errors, + "postAgentProfileErrors": self.post_agent_profile_errors, + "postAgentProfileRnfeAutoCreateEnabledErrors": self.post_agent_profile_rnfe_auto_create_enabled_errors + } + + @synchronized + def increment_sdk_error(self, error_type): + self.errors_count += 1 + self.sdk_client_errors += 1 + + if error_type == "configureAgentErrors": + self.configure_agent_errors += 1 + elif error_type == "configureAgentRnfeAutoCreateEnabledErrors": + self.configure_agent_rnfe_auto_create_enabled_errors += 1 + elif error_type == "createProfilingGroupErrors": + self.create_profiling_group_errors += 1 + elif error_type == "postAgentProfileErrors": + self.post_agent_profile_errors += 1 + elif error_type == "postAgentProfileRnfeAutoCreateEnabledErrors": + self.post_agent_profile_rnfe_auto_create_enabled_errors += 1 + + def record_sdk_error(self, error_type): + self.increment_sdk_error(error_type) + + +class AgentDebugInfo: + def __init__(self, errors_metadata): + self.errors_metadata = errors_metadata + + def serialize_to_json(self): + """ + This needs to be compliant with agent debug info schema. + https://code.amazon.com/packages/SkySailProfileIonSchema/blobs/811cc0e7e406e37a5b878acf31468be3dcd2963d/--/src/main/resources/schema/DebugInfo.isl#L21 + """ + return { + "errorsCount": self.errors_metadata.serialize_to_json() + } diff --git a/codeguru_profiler_agent/local_aggregator.py b/codeguru_profiler_agent/local_aggregator.py index aa31ea6..ee145b3 100644 --- a/codeguru_profiler_agent/local_aggregator.py +++ b/codeguru_profiler_agent/local_aggregator.py @@ -2,6 +2,7 @@ import time import datetime +from codeguru_profiler_agent.agent_metadata.agent_debug_info import AgentDebugInfo from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration from codeguru_profiler_agent.metrics.with_timer import with_timer from codeguru_profiler_agent.model.profile import Profile @@ -30,6 +31,7 @@ def __init__(self, reporter, environment=dict()): :param host_weight: (required inside environment) scale factor used to rescale the profile collected in this host to make the profile representative of the whole fleet :param timer: (required inside environment) timer to be used for metrics + :param errors_metadata: (required inside environment) metadata capturing errors in the current profile. :param profile_factory: (inside environment) the factory to created profiler; default Profile. :param clock: (inside environment) clock to be used; default is time.time """ @@ -37,6 +39,7 @@ def __init__(self, reporter, environment=dict()): self.profiling_group_name = environment["profiling_group_name"] self.host_weight = environment["host_weight"] self.timer = environment["timer"] + self.errors_metadata = environment["errors_metadata"] self.profile_factory = environment.get("profile_factory") or Profile self.clock = environment.get("clock") or time.time @@ -71,11 +74,13 @@ def _check_memory_limit(self): self.flush(force=True) def reset(self): + self.errors_metadata.reset() self.profile = self.profile_factory( profiling_group_name=self.profiling_group_name, sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(), host_weight=self.host_weight, start=current_milli_time(clock=self.clock), + agent_debug_info=AgentDebugInfo(self.errors_metadata), clock=self.clock ) self.timer.reset() diff --git a/codeguru_profiler_agent/model/profile.py b/codeguru_profiler_agent/model/profile.py index ba731c9..82f5cb3 100644 --- a/codeguru_profiler_agent/model/profile.py +++ b/codeguru_profiler_agent/model/profile.py @@ -11,7 +11,7 @@ class Profile: - def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, clock=time.time): + def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, agent_debug_info, clock=time.time): """ A profile holds the root node of the call graph and the metadata related to the profile """ @@ -35,6 +35,7 @@ def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, self.host_weight = int(host_weight) self._start_process_time = time.process_time() # provides process time in fractional seconds as float. self.overhead_ms = 0 + self.agent_debug_info = agent_debug_info @property def end(self): @@ -97,6 +98,9 @@ def _insert_stack(self, stack, runnable_count_increase=1): def get_memory_usage_bytes(self): return self.memory_counter.get_memory_usage_bytes() + def serialize_agent_debug_info_to_json(self): + return self.agent_debug_info.serialize_to_json() + def pause(self): if self.last_pause is not None: # pause gets called when profile is paused diff --git a/codeguru_profiler_agent/profiler.py b/codeguru_profiler_agent/profiler.py index 854f07c..5f7c8bb 100644 --- a/codeguru_profiler_agent/profiler.py +++ b/codeguru_profiler_agent/profiler.py @@ -7,6 +7,8 @@ from datetime import timedelta from random import SystemRandom from types import MappingProxyType as UnmodifiableDict + +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata from codeguru_profiler_agent.profiler_disabler import ProfilerDisabler from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration, AgentConfigurationMerger @@ -167,6 +169,7 @@ def _setup_final_environment(self, environment, environment_override): frozenset({environment['profiler_thread_name']}.union(environment['excluded_threads'])) # TODO delay metadata lookup until we need it environment['agent_metadata'] = environment.get('agent_metadata') or AgentMetadata() + environment['errors_metadata'] = environment.get('errors_metadata') or ErrorsMetadata() environment['collector'] = environment.get('collector') or self._select_collector(environment) environment["profiler_disabler"] = environment.get('profiler_disabler') or ProfilerDisabler(environment) return UnmodifiableDict(environment) diff --git a/codeguru_profiler_agent/sdk_reporter/profile_encoder.py b/codeguru_profiler_agent/sdk_reporter/profile_encoder.py index 315f3bf..dcc82d7 100644 --- a/codeguru_profiler_agent/sdk_reporter/profile_encoder.py +++ b/codeguru_profiler_agent/sdk_reporter/profile_encoder.py @@ -99,11 +99,15 @@ def encode_content(self): "start": int(self._profile.start), "end": int(self._profile.end), "agentMetadata": self._encode_agent_metadata(), - "callgraph": self._encode_call_graph(self._profile.callgraph) + "callgraph": self._encode_call_graph(self._profile.callgraph), + "debugInfo": self._encode_debug_info() } return json.dumps(profile_in_map) + def _encode_debug_info(self): + return self._profile.serialize_agent_debug_info_to_json() + def _encode_agent_metadata(self): profile_duration_seconds = self._profile.get_active_millis_since_start() / 1000.0 sample_weight = 1.0 if (profile_duration_seconds == 0) else self._profile.total_sample_count / profile_duration_seconds diff --git a/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py b/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py index 5df5e01..43e10cb 100644 --- a/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py +++ b/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py @@ -15,12 +15,14 @@ logger = logging.getLogger(__name__) AWS_EXECUTION_ENV_KEY = "AWS_EXECUTION_ENV" + class SdkReporter(Reporter): """ Handles communication with the CodeGuru Profiler Service backend. Encodes profiles using the ProfilerEncoder and reports them using the CodeGuru profiler SDK. """ is_create_pg_called_during_submit_profile = False + def __init__(self, environment): """ :param environment: dependency container dictionary for the current profiler. @@ -35,6 +37,7 @@ def __init__(self, environment): self.timer = environment.get("timer") self.metadata = environment["agent_metadata"] self.agent_config_merger = environment["agent_config_merger"] + self.errors_metadata = environment["errors_metadata"] def _encode_profile(self, profile): output_profile_stream = io.BytesIO() @@ -76,10 +79,12 @@ def refresh_configuration(self): # We handle service exceptions like this in boto3 # see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html if error.response['Error']['Code'] == 'ValidationException': + self.errors_metadata.record_sdk_error("configureAgentErrors") self.agent_config_merger.disable_profiling() self._log_request_failed(operation="configure_agent", exception=error) - if error.response['Error']['Code'] == 'ResourceNotFoundException': + elif error.response['Error']['Code'] == 'ResourceNotFoundException': if self.should_auto_create_profiling_group(): + self.errors_metadata.record_sdk_error("configureAgentRnfeAutoCreateEnabledErrors") logger.info( "Profiling group not found. Will try to create a profiling group " "with name = {} and compute platform = {} and retry calling configure agent after 5 minutes. " @@ -87,7 +92,10 @@ def refresh_configuration(self): .format(self.profiling_group_name, 'AWSLambda')) self.create_profiling_group() else: + self.errors_metadata.record_sdk_error("configureAgentErrors") self.agent_config_merger.disable_profiling() + else: + self.errors_metadata.record_sdk_error("configureAgentErrors") except Exception as e: self._log_request_failed(operation="configure_agent", exception=e) @@ -117,12 +125,17 @@ def report(self, profile): if error.response['Error']['Code'] == 'ResourceNotFoundException': if self.should_auto_create_profiling_group(): self.__class__.is_create_pg_called_during_submit_profile = True + self.errors_metadata.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") logger.info( "Profiling group not found. Will try to create a profiling group " "with name = {} and compute platform = {} and retry reporting during next invocation. " "Make sure that Lambda's execution role has AmazonCodeGuruProfilerAgentAccess policy added." .format(self.profiling_group_name, 'AWSLambda')) self.create_profiling_group() + else: + self.errors_metadata.record_sdk_error("postAgentProfileErrors") + else: + self.errors_metadata.record_sdk_error("postAgentProfileErrors") return False except Exception as e: self._log_request_failed(operation="post_agent_profile", exception=e) @@ -143,6 +156,8 @@ def create_profiling_group(self): if error.response['Error']['Code'] == 'ConflictException': logger.info("Profiling Group with name {} already exists. Please use a different name." .format(self.profiling_group_name)) + else: + self.errors_metadata.record_sdk_error("createProfilingGroupErrors") except Exception as e: self._log_request_failed(operation="create_profiling_group", exception=e) diff --git a/codeguru_profiler_agent/utils/synchronization.py b/codeguru_profiler_agent/utils/synchronization.py new file mode 100644 index 0000000..1d68ed5 --- /dev/null +++ b/codeguru_profiler_agent/utils/synchronization.py @@ -0,0 +1,15 @@ +import functools +import threading + + +def synchronized(wrapped): + """The missing @synchronized decorator + + https://git.io/vydTA""" + _lock = threading.RLock() + + @functools.wraps(wrapped) + def _wrapper(*args, **kwargs): + with _lock: + return wrapped(*args, **kwargs) + return _wrapper \ No newline at end of file diff --git a/test/integration/test_live_backend_reporting.py b/test/integration/test_live_backend_reporting.py index a79f231..832bbc1 100644 --- a/test/integration/test_live_backend_reporting.py +++ b/test/integration/test_live_backend_reporting.py @@ -5,6 +5,7 @@ from datetime import timedelta +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo from test.help_utils import MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS from test.pytestutils import before @@ -32,8 +33,10 @@ def before(self): stacks=[[Frame(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS)]], attempted_sample_threads_count=1, seen_threads_count=1) + errors_metadata = ErrorsMetadata() - self.profile = Profile(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, 1.0, 1.0, five_minutes_ago_millis) + self.profile = Profile(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, 1.0, 1.0, five_minutes_ago_millis, + AgentDebugInfo(errors_metadata)) # FIXME: Remove adding the end time manually below after feature fully support self.profile.end = now_millis self.profile.add(sample) @@ -47,7 +50,8 @@ def before(self): "minimum_time_reporting": timedelta(minutes=6), "max_stack_depth": 2345, "cpu_limit_percentage": 29, - "agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()) + "agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()), + "errors_metadata": errors_metadata } self.environment["codeguru_profiler_builder"] = CodeGuruClientBuilder(self.environment) self.agent_config = AgentConfiguration( diff --git a/test/unit/agent_metadata/test_agent_debug_info.py b/test/unit/agent_metadata/test_agent_debug_info.py new file mode 100644 index 0000000..6d18f2e --- /dev/null +++ b/test/unit/agent_metadata/test_agent_debug_info.py @@ -0,0 +1,135 @@ +import json + +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo + + +class TestAgentDebugInfo: + class TestAgentDebugInfo: + def test_it_returns_json_with_error_counts(self): + errors_metadata = ErrorsMetadata() + errors_metadata.record_sdk_error("createProfilingGroupErrors") + subject = AgentDebugInfo(errors_metadata) + + assert subject.serialize_to_json() == { + "errorsCount": { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 1, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + } + + class TestErrorsMetadata: + class TestSerializeToJson: + def test_it_returns_json_with_error_counts(self): + subject = ErrorsMetadata() + subject.record_sdk_error("configureAgentErrors") + + assert subject.serialize_to_json() == { + "errorsCount": 1, + "sdkClientErrors": 1, + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0 + } + + class TestRecordSdkError: + def test_it_increments_error_count_when_configureAgentErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("configureAgentErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 1 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_configureAgentRnfeAutoCreateEnabledErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("configureAgentRnfeAutoCreateEnabledErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 1 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_createProfilingGroupErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("createProfilingGroupErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 1 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_postAgentProfileErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("postAgentProfileErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 1 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_postAgentProfileRnfeAutoCreateEnabledErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 1 + + def test_it_increments_error_count_when_multiple_errors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("createProfilingGroupErrors") + subject.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") + + assert subject.errors_count == 2 + assert subject.sdk_client_errors == 2 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 1 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 1 + + def test_it_resets_error_count(self): + subject = ErrorsMetadata() + subject.record_sdk_error("createProfilingGroupErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 1 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + subject.reset() + + assert subject.errors_count == 0 + assert subject.sdk_client_errors == 0 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 diff --git a/test/unit/model/test_profile.py b/test/unit/model/test_profile.py index 667b038..07c0537 100644 --- a/test/unit/model/test_profile.py +++ b/test/unit/model/test_profile.py @@ -1,6 +1,7 @@ import pytest from unittest.mock import Mock +from codeguru_profiler_agent.agent_metadata.agent_debug_info import AgentDebugInfo, ErrorsMetadata from codeguru_profiler_agent.model.frame import Frame from test.pytestutils import before import datetime @@ -19,6 +20,7 @@ def before(self): sampling_interval_seconds=1.0, host_weight=2, start=self.test_start_time, + agent_debug_info=AgentDebugInfo(ErrorsMetadata()), clock=self.mock_clock ) @@ -292,11 +294,14 @@ def test_it_stores_valid_start_time(self): def test_it_raise_exception_for_invalid_start_time(self): with pytest.raises(ValueError): - Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=0) + Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=0, + agent_debug_info=AgentDebugInfo(ErrorsMetadata)) with pytest.raises(ValueError): - Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=-100) + Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=-100, + agent_debug_info=AgentDebugInfo(ErrorsMetadata)) with pytest.raises(TypeError): - Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=None) + Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=None, + agent_debug_info=AgentDebugInfo(ErrorsMetadata)) class TestEndTime(TestProfile): diff --git a/test/unit/sdk_reporter/test_sdk_profile_encoder.py b/test/unit/sdk_reporter/test_sdk_profile_encoder.py index f3092f2..1607b0f 100644 --- a/test/unit/sdk_reporter/test_sdk_profile_encoder.py +++ b/test/unit/sdk_reporter/test_sdk_profile_encoder.py @@ -4,6 +4,7 @@ import pytest from unittest.mock import MagicMock +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata from codeguru_profiler_agent.agent_metadata.aws_ec2_instance import AWSEC2Instance from codeguru_profiler_agent.model.frame import Frame @@ -23,7 +24,7 @@ def example_profile(): start_time = 1514764800000 end_time = 1514772000000 profile = Profile(profiling_group_name="TestProfilingGroupName", sampling_interval_seconds=1.0, host_weight=2, - start=start_time, clock=lambda: 1514772000000) + start=start_time, agent_debug_info=AgentDebugInfo(errors_metadata), clock=lambda: 1514772000000) profile.add( Sample(stacks=[[Frame("bottom"), Frame("middle"), Frame("top")], [Frame("bottom"), Frame("middle"), Frame("different_top")], @@ -40,10 +41,12 @@ def example_profile(): agent_metadata = AgentMetadata( fleet_info=AWSEC2Instance(host_name="testHostName", host_type="testHostType") ) +errors_metadata = ErrorsMetadata() environment = { "timer": Timer(), - "agent_metadata": agent_metadata + "agent_metadata": agent_metadata, + "errors_metadata": errors_metadata } diff --git a/test/unit/sdk_reporter/test_sdk_reporter.py b/test/unit/sdk_reporter/test_sdk_reporter.py index 287894d..c43bf51 100644 --- a/test/unit/sdk_reporter/test_sdk_reporter.py +++ b/test/unit/sdk_reporter/test_sdk_reporter.py @@ -5,6 +5,7 @@ from datetime import timedelta, datetime +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo from codeguru_profiler_agent.utils.time import current_milli_time from test.pytestutils import before from unittest.mock import MagicMock @@ -22,7 +23,32 @@ profiling_group_name = "test-ProfilingGroup-name" autocreated_test_lambda_profiling_group_name = "aws-lambda-testLambdaName" -profile = Profile(profiling_group_name, 1.0, 0.5, current_milli_time()) +errors_metadata = ErrorsMetadata() +profile = Profile(profiling_group_name, 1.0, 0.5, current_milli_time(), AgentDebugInfo(errors_metadata)) +expected_response_create_pg = { + 'profilingGroup': { + 'agentOrchestrationConfig': { + 'profilingEnabled': True + }, + 'arn': 'string', + 'computePlatform': 'AWSLambda', + 'createdAt': datetime(2015, 1, 1), + 'name': 'string', + 'profilingStatus': { + 'latestAgentOrchestratedAt': datetime(2015, 1, 1), + 'latestAgentProfileReportedAt': datetime(2015, 1, 1), + 'latestAggregatedProfile': { + 'period': 'PT5M', + 'start': datetime(2015, 1, 1) + } + }, + 'tags': { + 'string': 'string' + }, + 'updatedAt': datetime(2015, 1, 1) + } +} + class TestSdkReporter: def before(self): @@ -42,6 +68,7 @@ def before(self): "profile_encoder": profile_encoder, "codeguru_profiler_builder": codeguru_client_builder, "agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()), + "errors_metadata": errors_metadata, "reporting_interval": timedelta(minutes=13), "sampling_interval": timedelta(seconds=13), "minimum_time_reporting": timedelta(minutes=13), @@ -55,6 +82,7 @@ def before(self): self.environment["agent_config_merger"] = AgentConfigurationMerger(default_config) self.subject = SdkReporter(environment=self.environment) self.subject.setup() + self.subject.errors_metadata.reset() def clear_lambda_specific_environment_variables_for_test_run(self): keys_to_delete = [LAMBDA_TASK_ROOT, LAMBDA_RUNTIME_DIR, LAMBDA_EXECUTION_ENV, LAMBDA_MEMORY_SIZE_ENV] @@ -62,6 +90,7 @@ def clear_lambda_specific_environment_variables_for_test_run(self): if key in os.environ: os.environ.__delitem__(key) + class TestReport(TestSdkReporter): @before def before(self): @@ -84,6 +113,56 @@ def test_return_false_when_report_throws_error(self): with self.client_stubber: assert self.subject.report(profile) is False + def test_errors_metadata_when_post_agent_profile_error(self): + self.client_stubber.add_client_error('post_agent_profile', service_error_code="InternalFailure", + service_message='Simulated error in post_agent_profile call') + with self.client_stubber: + assert self.subject.report(profile) is False + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 1, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_post_agent_profile_rnfe_error(self): + self.client_stubber.add_client_error('post_agent_profile', service_error_code="ResourceNotFoundException", + service_message='Simulated error in post_agent_profile call') + with self.client_stubber: + assert self.subject.report(profile) is False + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 1, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_post_agent_profile_rnfe_error_auto_create_enabled(self): + self.client_stubber.add_client_error('post_agent_profile', service_error_code="ResourceNotFoundException", + service_message='Simulated error in post_agent_profile call') + self.client_stubber.add_response('create_profiling_group', expected_response_create_pg) + + os.environ.__setitem__(LAMBDA_TASK_ROOT, 'test-task-root') + os.environ.__setitem__(LAMBDA_RUNTIME_DIR, 'test-dir') + + with self.client_stubber: + assert self.subject.report(profile) is False + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 1, + "sdkClientErrors": 1 + } + def test_create_profiling_group_called_when_pg_does_not_exist_lambda_case(self): expected_params = { 'agentProfile': ANY, @@ -96,29 +175,6 @@ def test_create_profiling_group_called_when_pg_does_not_exist_lambda_case(self): 'post_agent_profile call', expected_params=expected_params) - expected_response_create_pg = { - 'profilingGroup': { - 'agentOrchestrationConfig': { - 'profilingEnabled': True - }, - 'arn': 'string', - 'computePlatform': 'AWSLambda', - 'createdAt': datetime(2015, 1, 1), - 'name': 'string', - 'profilingStatus': { - 'latestAgentOrchestratedAt': datetime(2015, 1, 1), - 'latestAgentProfileReportedAt': datetime(2015, 1, 1), - 'latestAggregatedProfile': { - 'period': 'PT5M', - 'start': datetime(2015, 1, 1) - } - }, - 'tags': { - 'string': 'string' - }, - 'updatedAt': datetime(2015, 1, 1) - } - } self.client_stubber.add_response('create_profiling_group', expected_response_create_pg) expected_params_post_agent_profile = { @@ -184,6 +240,71 @@ def test_agent_configuration_when_configure_agent_throws_error(self): assert AgentConfiguration.get().should_profile is True assert AgentConfiguration.get().sampling_interval == timedelta(seconds=13) + def test_errors_metadata_when_configure_agent_error(self): + self.client_stubber.add_client_error('configure_agent', service_error_code="InternalFailure", + service_message='Simulated error in configure_agent call') + with self.client_stubber: + self.subject.refresh_configuration() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_configure_agent_validation_exception_error(self): + self.client_stubber.add_client_error('configure_agent', service_error_code="ValidationException", + service_message='Simulated error in configure_agent call') + with self.client_stubber: + self.subject.refresh_configuration() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_configure_agent_rnfe_error(self): + self.client_stubber.add_client_error('configure_agent', service_error_code="ResourceNotFoundException", + service_message='Simulated error in configure_agent call') + with self.client_stubber: + self.subject.refresh_configuration() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_errors_metadata_when_configure_agent_rnfe_error_auto_create_enabled(self): + self.client_stubber.add_client_error('configure_agent', service_error_code="ResourceNotFoundException", + service_message='Simulated error in configure_agent call') + self.client_stubber.add_response('create_profiling_group', expected_response_create_pg) + + os.environ.__setitem__(LAMBDA_TASK_ROOT, 'test-task-root') + os.environ.__setitem__(LAMBDA_RUNTIME_DIR, 'test-dir') + + with self.client_stubber: + self.subject.refresh_configuration() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 1, + "createProfilingGroupErrors": 0, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + def test_when_backends_sends_resource_not_found_it_stops_the_profiling_in_non_lambda_case(self): self.client_stubber.add_client_error('configure_agent', service_error_code='ResourceNotFoundException', service_message='Simulated error in configure_agent call') @@ -213,29 +334,6 @@ def test_create_profiling_group_called_when_pg_does_not_exist_in_lambda_case(sel service_message='Simulated ResourceNotFoundException in ' 'configure_agent call') - expected_response_create_pg = { - 'profilingGroup': { - 'agentOrchestrationConfig': { - 'profilingEnabled': True - }, - 'arn': 'string', - 'computePlatform': 'AWSLambda', - 'createdAt': datetime(2015, 1, 1), - 'name': 'string', - 'profilingStatus': { - 'latestAgentOrchestratedAt': datetime(2015, 1, 1), - 'latestAgentProfileReportedAt': datetime(2015, 1, 1), - 'latestAggregatedProfile': { - 'period': 'PT5M', - 'start': datetime(2015, 1, 1) - } - }, - 'tags': { - 'string': 'string' - }, - 'updatedAt': datetime(2015, 1, 1) - } - } self.client_stubber.add_response('create_profiling_group', expected_response_create_pg) os.environ.__setitem__(LAMBDA_MEMORY_SIZE_ENV, "512") @@ -256,3 +354,24 @@ def test_create_pg_not_invoked_in_non_lambda_case(self): with self.client_stubber: self.subject.refresh_configuration() self.client_stubber.assert_no_pending_responses() + + +class TestCreateProfilingGroup(TestSdkReporter): + @before + def before(self): + super().before() + + def test_errors_metadata_when_create_profiling_group_error(self): + self.client_stubber.add_client_error('create_profiling_group', service_error_code="InternalFailure", + service_message='Simulated error in create_profiling_group call') + with self.client_stubber: + self.subject.create_profiling_group() + assert self.subject.errors_metadata.serialize_to_json() == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 1, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } diff --git a/test/unit/test_local_aggregator.py b/test/unit/test_local_aggregator.py index 604a3ae..ee11ddf 100644 --- a/test/unit/test_local_aggregator.py +++ b/test/unit/test_local_aggregator.py @@ -2,6 +2,9 @@ from datetime import timedelta +from botocore.stub import ANY + +from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration from codeguru_profiler_agent.utils.time import current_milli_time from test.pytestutils import before @@ -22,6 +25,8 @@ TEST_PROFILING_GROUP_NAME = "test-application" TEST_SAMPLING_INTERVAL = 0.5 TEST_HOST_WEIGHT = 1.0 +ERRORS_METADATA = MagicMock() +AGENT_DEBUG_INFO = MagicMock() ONE_SECOND = timedelta(seconds=1) @@ -35,7 +40,8 @@ def mock_timer(): def assert_profile_is_reset(profile_factory, clock): profile_factory.assert_called_once_with( profiling_group_name=TEST_PROFILING_GROUP_NAME, host_weight=TEST_HOST_WEIGHT, - sampling_interval_seconds=TEST_SAMPLING_INTERVAL, start=current_milli_time(clock), clock=clock) + sampling_interval_seconds=TEST_SAMPLING_INTERVAL, start=current_milli_time(clock), + agent_debug_info=ANY, clock=clock) class TestLocalAggregator: @@ -57,6 +63,7 @@ def before(self): "sampling_interval": timedelta(seconds=TEST_SAMPLING_INTERVAL), "host_weight": TEST_HOST_WEIGHT, "profile_factory": self.mock_profile_factory, + "errors_metadata": ERRORS_METADATA, "memory_limit_bytes": DEFAULT_MEMORY_LIMIT_BYTES, "clock": self.clock, "timer": self.timer, @@ -78,9 +85,9 @@ def before(self): ) ) - assert len(self.environment.keys()) == 7 + assert len(self.environment.keys()) == 8 self.profiler = Profiler(profiling_group_name=TEST_PROFILING_GROUP_NAME, environment_override=self.environment) - assert len(self.environment.keys()) == 6 + assert len(self.environment.keys()) == 7 self.subject = LocalAggregator(**self.configuration) self.mock_profile_factory.reset_mock() From 8f30db7efdb4f549868f9e8dabccef17ed300de7 Mon Sep 17 00:00:00 2001 From: Parag Panda Date: Thu, 22 Apr 2021 11:32:18 -0700 Subject: [PATCH 2/6] Fix error count calculation for Rnfe errors Added comments for more clarity. --- .../agent_metadata/agent_debug_info.py | 89 ++++-- codeguru_profiler_agent/local_aggregator.py | 5 +- .../utils/synchronization.py | 15 - .../agent_metadata/test_agent_debug_info.py | 268 ++++++++++-------- test/unit/sdk_reporter/test_sdk_reporter.py | 4 +- 5 files changed, 217 insertions(+), 164 deletions(-) delete mode 100644 codeguru_profiler_agent/utils/synchronization.py diff --git a/codeguru_profiler_agent/agent_metadata/agent_debug_info.py b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py index e4311db..5aac744 100644 --- a/codeguru_profiler_agent/agent_metadata/agent_debug_info.py +++ b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py @@ -1,17 +1,19 @@ -from codeguru_profiler_agent.utils.synchronization import synchronized +import logging +import os + +logger = logging.getLogger(__name__) class ErrorsMetadata: def __init__(self): - self.errors_count = 0 - self.sdk_client_errors = 0 - self.configure_agent_errors = 0 - self.configure_agent_rnfe_auto_create_enabled_errors = 0 - self.create_profiling_group_errors = 0 - self.post_agent_profile_errors = 0 - self.post_agent_profile_rnfe_auto_create_enabled_errors = 0 + self.reset() def reset(self): + """ + We want to differentiate API call errors more granularly. We want to gather ResourceNotFoundException errors + because we are going to get this exception with auto-create feature and want to monitor how many times + the agent is not able to create the PG and resulting in subsequent ResourceNotFoundException. + """ self.errors_count = 0 self.sdk_client_errors = 0 self.configure_agent_errors = 0 @@ -20,12 +22,10 @@ def reset(self): self.post_agent_profile_errors = 0 self.post_agent_profile_rnfe_auto_create_enabled_errors = 0 - """ - This needs to be compliant with errors count schema. - https://code.amazon.com/packages/SkySailProfileIonSchema/blobs/811cc0e7e406e37a5b878acf31468be3dcd2963d/--/src/main/resources/schema/DebugInfo.isl#L21 - """ - def serialize_to_json(self): + """ + This needs to be compliant with errors count schema. + """ return { "errorsCount": self.errors_count, "sdkClientErrors": self.sdk_client_errors, @@ -36,20 +36,31 @@ def serialize_to_json(self): "postAgentProfileRnfeAutoCreateEnabledErrors": self.post_agent_profile_rnfe_auto_create_enabled_errors } - @synchronized def increment_sdk_error(self, error_type): + """ + ErrorsCount is the umbrella of all the kinds of error we want to capture. Currently we have only SdkClientErrors + in it. SdkClientErrors is comprised of different API level errors like ConfigureAgentErrors, + PostAgentProfileErrors, CreateProfilingGroupErrors. + :param error_type: The type of API level error that we want to capture. + """ self.errors_count += 1 self.sdk_client_errors += 1 + """ + Special handling for ResourceNotFoundException errors. + For example configureAgentRnfeAutoCreateEnabledErrors is also a configureAgentErrors. + """ if error_type == "configureAgentErrors": self.configure_agent_errors += 1 elif error_type == "configureAgentRnfeAutoCreateEnabledErrors": + self.configure_agent_errors += 1 self.configure_agent_rnfe_auto_create_enabled_errors += 1 elif error_type == "createProfilingGroupErrors": self.create_profiling_group_errors += 1 elif error_type == "postAgentProfileErrors": self.post_agent_profile_errors += 1 elif error_type == "postAgentProfileRnfeAutoCreateEnabledErrors": + self.post_agent_profile_errors += 1 self.post_agent_profile_rnfe_auto_create_enabled_errors += 1 def record_sdk_error(self, error_type): @@ -57,14 +68,54 @@ def record_sdk_error(self, error_type): class AgentDebugInfo: - def __init__(self, errors_metadata): + def __init__(self, errors_metadata=None, agent_start_time=None, timer=None): + self.process_id = get_process_id() self.errors_metadata = errors_metadata + self.agent_start_time = agent_start_time + self.timer = timer def serialize_to_json(self): """ This needs to be compliant with agent debug info schema. - https://code.amazon.com/packages/SkySailProfileIonSchema/blobs/811cc0e7e406e37a5b878acf31468be3dcd2963d/--/src/main/resources/schema/DebugInfo.isl#L21 """ - return { - "errorsCount": self.errors_metadata.serialize_to_json() - } + json = {} + + self.add_agent_start_time(json) + self.add_process_id(json) + self.add_errors_metadata(json) + self.add_generic_metrics(json) + + return json + + def add_agent_start_time(self, json): + if self.agent_start_time is not None: + json["agentStartTime"] = int(self.agent_start_time) + + def add_errors_metadata(self, json): + if self.errors_metadata is not None: + json["errorsCount"] = self.errors_metadata.serialize_to_json() + + def add_process_id(self, json): + if self.process_id is not None: + json["processId"] = self.process_id + + def add_generic_metrics(self, json): + if self.timer is not None and self.timer.metrics: + generic_metrics = {} + + for metric in self.timer.metrics: + metric_value = self.timer.metrics[metric] + generic_metrics[metric + "_max"] = metric_value.max + generic_metrics[metric + "_average"] = metric_value.average() + + if generic_metrics: + json["genericMetrics"] = generic_metrics + + +def get_process_id(): + try: + return os.getpid() + except Exception as e: + logger.info("Failed to get the process id, " + repr(e)) + return None + diff --git a/codeguru_profiler_agent/local_aggregator.py b/codeguru_profiler_agent/local_aggregator.py index ee145b3..4d9f191 100644 --- a/codeguru_profiler_agent/local_aggregator.py +++ b/codeguru_profiler_agent/local_aggregator.py @@ -47,6 +47,7 @@ def __init__(self, reporter, environment=dict()): self.profile = None self.memory_limit_bytes = environment["memory_limit_bytes"] self.last_report_attempted = current_milli_time(clock=self.clock) + self.agent_start_time = int(current_milli_time(clock=self.clock)) self.reset() @@ -75,15 +76,15 @@ def _check_memory_limit(self): def reset(self): self.errors_metadata.reset() + self.timer.reset() self.profile = self.profile_factory( profiling_group_name=self.profiling_group_name, sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(), host_weight=self.host_weight, start=current_milli_time(clock=self.clock), - agent_debug_info=AgentDebugInfo(self.errors_metadata), + agent_debug_info=AgentDebugInfo(self.errors_metadata, self.agent_start_time, self.timer), clock=self.clock ) - self.timer.reset() @with_timer("flush") def flush(self, force=False, reset=True): diff --git a/codeguru_profiler_agent/utils/synchronization.py b/codeguru_profiler_agent/utils/synchronization.py deleted file mode 100644 index 1d68ed5..0000000 --- a/codeguru_profiler_agent/utils/synchronization.py +++ /dev/null @@ -1,15 +0,0 @@ -import functools -import threading - - -def synchronized(wrapped): - """The missing @synchronized decorator - - https://git.io/vydTA""" - _lock = threading.RLock() - - @functools.wraps(wrapped) - def _wrapper(*args, **kwargs): - with _lock: - return wrapped(*args, **kwargs) - return _wrapper \ No newline at end of file diff --git a/test/unit/agent_metadata/test_agent_debug_info.py b/test/unit/agent_metadata/test_agent_debug_info.py index 6d18f2e..1b0fa23 100644 --- a/test/unit/agent_metadata/test_agent_debug_info.py +++ b/test/unit/agent_metadata/test_agent_debug_info.py @@ -1,135 +1,151 @@ -import json - from codeguru_profiler_agent.agent_metadata.agent_debug_info import ErrorsMetadata, AgentDebugInfo +from codeguru_profiler_agent.metrics.timer import Timer class TestAgentDebugInfo: - class TestAgentDebugInfo: + def test_it_returns_json_with_error_counts(self): + errors_metadata = ErrorsMetadata() + errors_metadata.record_sdk_error("createProfilingGroupErrors") + subject = AgentDebugInfo(errors_metadata) + + serialized_json = subject.serialize_to_json() + assert serialized_json["errorsCount"] == { + "configureAgentErrors": 0, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 1, + "errorsCount": 1, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0, + "sdkClientErrors": 1 + } + + def test_it_returns_json_with_agent_start_time(self): + subject = AgentDebugInfo(agent_start_time=12345) + + serialized_json = subject.serialize_to_json() + assert serialized_json["agentStartTime"] == 12345 + + def test_it_returns_json_with_generic_metrics(self): + timer = Timer() + timer.record("metric1", 12345000) + timer.record("metric1", 12350000) + subject = AgentDebugInfo(timer=timer) + + serialized_json = subject.serialize_to_json() + assert serialized_json["genericMetrics"] == { + "metric1_max": 12350000, + "metric1_average": 12347500.0 + } + + +class TestErrorsMetadata: + class TestSerializeToJson: def test_it_returns_json_with_error_counts(self): - errors_metadata = ErrorsMetadata() - errors_metadata.record_sdk_error("createProfilingGroupErrors") - subject = AgentDebugInfo(errors_metadata) + subject = ErrorsMetadata() + subject.record_sdk_error("configureAgentErrors") assert subject.serialize_to_json() == { - "errorsCount": { - "configureAgentErrors": 0, - "configureAgentRnfeAutoCreateEnabledErrors": 0, - "createProfilingGroupErrors": 1, - "errorsCount": 1, - "postAgentProfileErrors": 0, - "postAgentProfileRnfeAutoCreateEnabledErrors": 0, - "sdkClientErrors": 1 - } + "errorsCount": 1, + "sdkClientErrors": 1, + "configureAgentErrors": 1, + "configureAgentRnfeAutoCreateEnabledErrors": 0, + "createProfilingGroupErrors": 0, + "postAgentProfileErrors": 0, + "postAgentProfileRnfeAutoCreateEnabledErrors": 0 } - class TestErrorsMetadata: - class TestSerializeToJson: - def test_it_returns_json_with_error_counts(self): - subject = ErrorsMetadata() - subject.record_sdk_error("configureAgentErrors") - - assert subject.serialize_to_json() == { - "errorsCount": 1, - "sdkClientErrors": 1, - "configureAgentErrors": 1, - "configureAgentRnfeAutoCreateEnabledErrors": 0, - "createProfilingGroupErrors": 0, - "postAgentProfileErrors": 0, - "postAgentProfileRnfeAutoCreateEnabledErrors": 0 - } - - class TestRecordSdkError: - def test_it_increments_error_count_when_configureAgentErrors(self): - subject = ErrorsMetadata() - subject.record_sdk_error("configureAgentErrors") - - assert subject.errors_count == 1 - assert subject.sdk_client_errors == 1 - assert subject.configure_agent_errors == 1 - assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 - assert subject.create_profiling_group_errors == 0 - assert subject.post_agent_profile_errors == 0 - assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 - - def test_it_increments_error_count_when_configureAgentRnfeAutoCreateEnabledErrors(self): - subject = ErrorsMetadata() - subject.record_sdk_error("configureAgentRnfeAutoCreateEnabledErrors") - - assert subject.errors_count == 1 - assert subject.sdk_client_errors == 1 - assert subject.configure_agent_errors == 0 - assert subject.configure_agent_rnfe_auto_create_enabled_errors == 1 - assert subject.create_profiling_group_errors == 0 - assert subject.post_agent_profile_errors == 0 - assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 - - def test_it_increments_error_count_when_createProfilingGroupErrors(self): - subject = ErrorsMetadata() - subject.record_sdk_error("createProfilingGroupErrors") - - assert subject.errors_count == 1 - assert subject.sdk_client_errors == 1 - assert subject.configure_agent_errors == 0 - assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 - assert subject.create_profiling_group_errors == 1 - assert subject.post_agent_profile_errors == 0 - assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 - - def test_it_increments_error_count_when_postAgentProfileErrors(self): - subject = ErrorsMetadata() - subject.record_sdk_error("postAgentProfileErrors") - - assert subject.errors_count == 1 - assert subject.sdk_client_errors == 1 - assert subject.configure_agent_errors == 0 - assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 - assert subject.create_profiling_group_errors == 0 - assert subject.post_agent_profile_errors == 1 - assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 - - def test_it_increments_error_count_when_postAgentProfileRnfeAutoCreateEnabledErrors(self): - subject = ErrorsMetadata() - subject.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") - - assert subject.errors_count == 1 - assert subject.sdk_client_errors == 1 - assert subject.configure_agent_errors == 0 - assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 - assert subject.create_profiling_group_errors == 0 - assert subject.post_agent_profile_errors == 0 - assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 1 - - def test_it_increments_error_count_when_multiple_errors(self): - subject = ErrorsMetadata() - subject.record_sdk_error("createProfilingGroupErrors") - subject.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") - - assert subject.errors_count == 2 - assert subject.sdk_client_errors == 2 - assert subject.configure_agent_errors == 0 - assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 - assert subject.create_profiling_group_errors == 1 - assert subject.post_agent_profile_errors == 0 - assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 1 - - def test_it_resets_error_count(self): - subject = ErrorsMetadata() - subject.record_sdk_error("createProfilingGroupErrors") - - assert subject.errors_count == 1 - assert subject.sdk_client_errors == 1 - assert subject.configure_agent_errors == 0 - assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 - assert subject.create_profiling_group_errors == 1 - assert subject.post_agent_profile_errors == 0 - assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 - - subject.reset() - - assert subject.errors_count == 0 - assert subject.sdk_client_errors == 0 - assert subject.configure_agent_errors == 0 - assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 - assert subject.create_profiling_group_errors == 0 - assert subject.post_agent_profile_errors == 0 - assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + class TestRecordSdkError: + def test_it_increments_error_count_when_configureAgentErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("configureAgentErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 1 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_configureAgentRnfeAutoCreateEnabledErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("configureAgentRnfeAutoCreateEnabledErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 1 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 1 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_createProfilingGroupErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("createProfilingGroupErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 1 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_postAgentProfileErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("postAgentProfileErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 1 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + def test_it_increments_error_count_when_postAgentProfileRnfeAutoCreateEnabledErrors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 1 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 1 + + def test_it_increments_error_count_when_multiple_errors(self): + subject = ErrorsMetadata() + subject.record_sdk_error("createProfilingGroupErrors") + subject.record_sdk_error("postAgentProfileRnfeAutoCreateEnabledErrors") + + assert subject.errors_count == 2 + assert subject.sdk_client_errors == 2 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 1 + assert subject.post_agent_profile_errors == 1 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 1 + + def test_it_resets_error_count(self): + subject = ErrorsMetadata() + subject.record_sdk_error("createProfilingGroupErrors") + + assert subject.errors_count == 1 + assert subject.sdk_client_errors == 1 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 1 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 + + subject.reset() + + assert subject.errors_count == 0 + assert subject.sdk_client_errors == 0 + assert subject.configure_agent_errors == 0 + assert subject.configure_agent_rnfe_auto_create_enabled_errors == 0 + assert subject.create_profiling_group_errors == 0 + assert subject.post_agent_profile_errors == 0 + assert subject.post_agent_profile_rnfe_auto_create_enabled_errors == 0 diff --git a/test/unit/sdk_reporter/test_sdk_reporter.py b/test/unit/sdk_reporter/test_sdk_reporter.py index c43bf51..c0d71c9 100644 --- a/test/unit/sdk_reporter/test_sdk_reporter.py +++ b/test/unit/sdk_reporter/test_sdk_reporter.py @@ -158,7 +158,7 @@ def test_errors_metadata_when_post_agent_profile_rnfe_error_auto_create_enabled( "configureAgentRnfeAutoCreateEnabledErrors": 0, "createProfilingGroupErrors": 0, "errorsCount": 1, - "postAgentProfileErrors": 0, + "postAgentProfileErrors": 1, "postAgentProfileRnfeAutoCreateEnabledErrors": 1, "sdkClientErrors": 1 } @@ -296,7 +296,7 @@ def test_errors_metadata_when_configure_agent_rnfe_error_auto_create_enabled(sel with self.client_stubber: self.subject.refresh_configuration() assert self.subject.errors_metadata.serialize_to_json() == { - "configureAgentErrors": 0, + "configureAgentErrors": 1, "configureAgentRnfeAutoCreateEnabledErrors": 1, "createProfilingGroupErrors": 0, "errorsCount": 1, From a33c28c604960eb4d21f52230da685848b161ec8 Mon Sep 17 00:00:00 2001 From: Parag Panda Date: Fri, 23 Apr 2021 04:29:30 -0700 Subject: [PATCH 3/6] Add back synchronization util --- codeguru_profiler_agent/utils/synchronization.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 codeguru_profiler_agent/utils/synchronization.py diff --git a/codeguru_profiler_agent/utils/synchronization.py b/codeguru_profiler_agent/utils/synchronization.py new file mode 100644 index 0000000..37460be --- /dev/null +++ b/codeguru_profiler_agent/utils/synchronization.py @@ -0,0 +1,15 @@ +import functools +import threading + + +def synchronized(wrapped): + """The missing @synchronized decorator + + https://git.io/vydTA""" + _lock = threading.RLock() + + @functools.wraps(wrapped) + def _wrapper(*args, **kwargs): + with _lock: + return wrapped(*args, **kwargs) + return _wrapper From 3626cfd91362dada47db166cc4554de5d9cbfda6 Mon Sep 17 00:00:00 2001 From: Parag Panda Date: Fri, 23 Apr 2021 04:29:30 -0700 Subject: [PATCH 4/6] Add back synchronization util --- codeguru_profiler_agent/agent_metadata/agent_debug_info.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/codeguru_profiler_agent/agent_metadata/agent_debug_info.py b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py index 5aac744..26d40de 100644 --- a/codeguru_profiler_agent/agent_metadata/agent_debug_info.py +++ b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py @@ -1,6 +1,8 @@ import logging import os +from codeguru_profiler_agent.utils.synchronization import synchronized + logger = logging.getLogger(__name__) @@ -36,6 +38,7 @@ def serialize_to_json(self): "postAgentProfileRnfeAutoCreateEnabledErrors": self.post_agent_profile_rnfe_auto_create_enabled_errors } + @synchronized def increment_sdk_error(self, error_type): """ ErrorsCount is the umbrella of all the kinds of error we want to capture. Currently we have only SdkClientErrors From b2c8b895fd62ff3bd272e2dfaa5bfb41a9e646cd Mon Sep 17 00:00:00 2001 From: Parag Panda Date: Mon, 26 Apr 2021 15:09:10 -0700 Subject: [PATCH 5/6] Incorporating review comments of revision 2 --- .../agent_metadata/agent_debug_info.py | 12 ++++++------ codeguru_profiler_agent/local_aggregator.py | 2 +- codeguru_profiler_agent/utils/time.py | 2 +- test/unit/agent_metadata/test_agent_debug_info.py | 8 ++++---- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/codeguru_profiler_agent/agent_metadata/agent_debug_info.py b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py index 26d40de..b3f8112 100644 --- a/codeguru_profiler_agent/agent_metadata/agent_debug_info.py +++ b/codeguru_profiler_agent/agent_metadata/agent_debug_info.py @@ -2,6 +2,7 @@ import os from codeguru_profiler_agent.utils.synchronization import synchronized +from codeguru_profiler_agent.utils.time import to_iso logger = logging.getLogger(__name__) @@ -92,7 +93,7 @@ def serialize_to_json(self): def add_agent_start_time(self, json): if self.agent_start_time is not None: - json["agentStartTime"] = int(self.agent_start_time) + json["agentStartTime"] = to_iso(self.agent_start_time) def add_errors_metadata(self, json): if self.errors_metadata is not None: @@ -106,10 +107,9 @@ def add_generic_metrics(self, json): if self.timer is not None and self.timer.metrics: generic_metrics = {} - for metric in self.timer.metrics: - metric_value = self.timer.metrics[metric] - generic_metrics[metric + "_max"] = metric_value.max - generic_metrics[metric + "_average"] = metric_value.average() + for metric, metric_value in self.timer.metrics.items(): + generic_metrics[metric + "_timings_max"] = metric_value.max + generic_metrics[metric + "_timings_average"] = metric_value.average() if generic_metrics: json["genericMetrics"] = generic_metrics @@ -119,6 +119,6 @@ def get_process_id(): try: return os.getpid() except Exception as e: - logger.info("Failed to get the process id, " + repr(e)) + logger.info("Failed to get the process id", exc_info=True) return None diff --git a/codeguru_profiler_agent/local_aggregator.py b/codeguru_profiler_agent/local_aggregator.py index 4d9f191..1091c9b 100644 --- a/codeguru_profiler_agent/local_aggregator.py +++ b/codeguru_profiler_agent/local_aggregator.py @@ -47,7 +47,7 @@ def __init__(self, reporter, environment=dict()): self.profile = None self.memory_limit_bytes = environment["memory_limit_bytes"] self.last_report_attempted = current_milli_time(clock=self.clock) - self.agent_start_time = int(current_milli_time(clock=self.clock)) + self.agent_start_time = current_milli_time(clock=self.clock) self.reset() diff --git a/codeguru_profiler_agent/utils/time.py b/codeguru_profiler_agent/utils/time.py index 5ebb3ec..02feb30 100644 --- a/codeguru_profiler_agent/utils/time.py +++ b/codeguru_profiler_agent/utils/time.py @@ -5,7 +5,7 @@ def to_iso(epoch_milli): try: - return datetime.fromtimestamp(epoch_milli / 1000).isoformat() + return datetime.fromtimestamp(epoch_milli / 1000).isoformat() + "Z" # ISO 8601 date-time format except ValueError: return str(epoch_milli) diff --git a/test/unit/agent_metadata/test_agent_debug_info.py b/test/unit/agent_metadata/test_agent_debug_info.py index 1b0fa23..3728133 100644 --- a/test/unit/agent_metadata/test_agent_debug_info.py +++ b/test/unit/agent_metadata/test_agent_debug_info.py @@ -20,10 +20,10 @@ def test_it_returns_json_with_error_counts(self): } def test_it_returns_json_with_agent_start_time(self): - subject = AgentDebugInfo(agent_start_time=12345) + subject = AgentDebugInfo(agent_start_time=1577869200000) # 2020-01-01T01:00:00Z serialized_json = subject.serialize_to_json() - assert serialized_json["agentStartTime"] == 12345 + assert serialized_json["agentStartTime"] == "2020-01-01T01:00:00Z" def test_it_returns_json_with_generic_metrics(self): timer = Timer() @@ -33,8 +33,8 @@ def test_it_returns_json_with_generic_metrics(self): serialized_json = subject.serialize_to_json() assert serialized_json["genericMetrics"] == { - "metric1_max": 12350000, - "metric1_average": 12347500.0 + "metric1_timings_max": 12350000, + "metric1_timings_average": 12347500.0 } From 2fda8717f127332f517dcbb98bdbdeecf23e003c Mon Sep 17 00:00:00 2001 From: Parag Panda Date: Mon, 26 Apr 2021 19:27:40 -0700 Subject: [PATCH 6/6] Using utcfromtimestamp to get to_iso time to make sure local timezone doesn't affect datetime value --- codeguru_profiler_agent/utils/time.py | 5 ++++- test/unit/agent_metadata/test_agent_debug_info.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/codeguru_profiler_agent/utils/time.py b/codeguru_profiler_agent/utils/time.py index 02feb30..a6f4095 100644 --- a/codeguru_profiler_agent/utils/time.py +++ b/codeguru_profiler_agent/utils/time.py @@ -3,11 +3,14 @@ import time from datetime import datetime + def to_iso(epoch_milli): try: - return datetime.fromtimestamp(epoch_milli / 1000).isoformat() + "Z" # ISO 8601 date-time format + return datetime.utcfromtimestamp(epoch_milli / 1000).isoformat( + timespec='milliseconds') + "Z" # ISO 8601 date-time format except ValueError: return str(epoch_milli) + def current_milli_time(clock=time.time): return int(clock() * 1000) diff --git a/test/unit/agent_metadata/test_agent_debug_info.py b/test/unit/agent_metadata/test_agent_debug_info.py index 3728133..27a748f 100644 --- a/test/unit/agent_metadata/test_agent_debug_info.py +++ b/test/unit/agent_metadata/test_agent_debug_info.py @@ -20,10 +20,10 @@ def test_it_returns_json_with_error_counts(self): } def test_it_returns_json_with_agent_start_time(self): - subject = AgentDebugInfo(agent_start_time=1577869200000) # 2020-01-01T01:00:00Z + subject = AgentDebugInfo(agent_start_time=1577840400000) # 2020-01-01T01:00:00Z UTC serialized_json = subject.serialize_to_json() - assert serialized_json["agentStartTime"] == "2020-01-01T01:00:00Z" + assert serialized_json["agentStartTime"] == "2020-01-01T01:00:00.000Z" def test_it_returns_json_with_generic_metrics(self): timer = Timer()