diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..6fb590e --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,58 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python package + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.6', '3.7', '3.8', '3.9'] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install pip + run: | + python -m pip install --upgrade pip + - name: Install dependencies for running tests + run: | + python -m pip install flake8 pytest pytest-print + python -m pip install mock httpretty six pympler + - name: Install dependencies for additional checks + run: | + python -m pip install bandit + - name: Install dependencies from requirements + run: | + pip install -r requirements.txt + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Run bandit + run: | + # run bandit to find common security issues + bandit -r codeguru_profiler_agent + - name: Run a specific test with logs + run: | + pytest -vv -o log_cli=true test/acceptance/test_live_profiling.py + - name: Run tests with pytest + run: | + pytest -vv + # For local testing, you can use pytest-html if you want a generated html report. + # python -m pip install pytest-html + # pytest -vv --html=pytest-report.html --self-contained-html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b15db01 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +pytest-report.html diff --git a/CHANGELOG.rst b/CHANGELOG.rst new file mode 100644 index 0000000..557ecbc --- /dev/null +++ b/CHANGELOG.rst @@ -0,0 +1,15 @@ +========= +CHANGELOG +========= + +1.0.1 (layer_v3) +=================== + +* Fix bug for running agent in Windows; Update module_path_extractor to support Windows applications +* Use json library instead of custom encoder for profile encoding for more reliable performance +* Specify min version for boto3 in setup.py + +1.0.0 (layer_v1, layer_v2) +========================== + +* Initial Release diff --git a/README.md b/README.md index 847260c..8941610 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,10 @@ -## My Project +## Amazon CodeGuru Profiler Python Agent -TODO: Fill this README out! +For more details, check the documentation: https://docs.aws.amazon.com/codeguru/latest/profiler-ug/what-is-codeguru-profiler.html -Be sure to: +## Release to PyPI -* Change the title in this README -* Edit your repository description on GitHub +Use the `setup.py` script to create the archive. ## Security diff --git a/codeguru_profiler_agent/__init__.py b/codeguru_profiler_agent/__init__.py new file mode 100644 index 0000000..ae94626 --- /dev/null +++ b/codeguru_profiler_agent/__init__.py @@ -0,0 +1,12 @@ +""" +Modules +------- + +.. 
automodule:: codeguru_profiler_agent.profiler
+   :members:
+
+"""
+
+from . import *
+from .profiler import Profiler
+from .aws_lambda.profiler_decorator import with_lambda_profiler
diff --git a/codeguru_profiler_agent/__main__.py b/codeguru_profiler_agent/__main__.py
new file mode 100644
index 0000000..3d5efd1
--- /dev/null
+++ b/codeguru_profiler_agent/__main__.py
@@ -0,0 +1,99 @@
+import os
+import sys
+import runpy
+import logging
+
+profiler = None
+
+
+def _start_profiler(options, env):
+    """
+    This will init the profiler object and start it.
+    :param options: options may contain the profiling group name, region or credential profile if they were passed on the command line
+    :param env: the environment dict from which to search for variables (usually os.environ is passed)
+    :return: the profiler object
+    """
+    from codeguru_profiler_agent.profiler_builder import build_profiler
+    global profiler
+    profiler = build_profiler(pg_name=options.profiling_group_name, region_name=options.region,
+                              credential_profile=options.credential_profile, env=env)
+    if profiler is not None:
+        profiler.start()
+    return profiler
+
+
+def _set_log_level(log_level):
+    if log_level is None:
+        return
+    numeric_level = getattr(logging, log_level.upper(), None)
+    if isinstance(numeric_level, int):
+        logging.basicConfig(level=numeric_level)
+
+
+def main(input_args=sys.argv[1:], env=os.environ, start_profiler=_start_profiler):
+    from argparse import ArgumentParser
+    usage = 'python -m codeguru_profiler_agent [-p profilingGroupName] [-r region] [-c credentialProfileName]' \
+            ' [-m module | scriptfile.py] [arg]' \
+            + '...\nexample: python -m codeguru_profiler_agent -p myProfilingGroup hello_world.py'
+    parser = ArgumentParser(usage=usage)
+    parser.add_argument('-p', '--profiling-group-name', dest="profiling_group_name",
+                        help='Name of the profiling group to send profiles into')
+    parser.add_argument('-r', '--region', dest="region",
+                        help='Region in which you have created your profiling group. e.g. "us-west-2".'
+                             + ' Default depends on your configuration'
+                             + ' (see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html)')
+    parser.add_argument('-c', '--credential-profile-name', dest="credential_profile",
+                        help='Name of the profile created in the shared credential file used for submitting profiles. '
+                             + '(see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#shared-credentials-file)')
+    parser.add_argument('-m', dest='module', action='store_true',
+                        help='Profile a library module', default=False)
+    parser.add_argument('--log', dest='log_level',
+                        help='Set log level, possible values: debug, info, warning, error and critical'
+                             + ' (default is warning)')
+    parser.add_argument('scriptfile')
+
+    (known_args, rest) = parser.parse_known_args(args=input_args)
+
+    # Set the sys arguments to the remaining arguments (the ones needed by the client script) if any were set.
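+    # Editor's note: an illustrative run (module and script names are hypothetical). Invoking
+    #   python -m codeguru_profiler_agent -p myProfilingGroup hello_world.py --foo bar
+    # yields known_args.profiling_group_name == 'myProfilingGroup',
+    # known_args.scriptfile == 'hello_world.py' and rest == ['--foo', 'bar'], so the profiled
+    # script still sees its own arguments after the reassignment below.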
+    sys.argv = sys.argv[:1]
+    if len(rest) > 0:
+        sys.argv += rest
+
+    _set_log_level(known_args.log_level)
+
+    if known_args.module:
+        code = "run_module(modname, run_name='__main__')"
+        globs = {
+            'run_module': runpy.run_module,
+            'modname': known_args.scriptfile
+        }
+    else:
+        script_name = known_args.scriptfile
+        sys.path.insert(0, os.path.dirname(script_name))
+        with open(script_name, 'rb') as fp:
+            code = compile(fp.read(), script_name, 'exec')
+        globs = {
+            '__file__': script_name,
+            '__name__': '__main__',
+            '__package__': None,
+            '__cached__': None,
+        }
+
+    # now start the profiler, run the user's code, and stop the profiler when it finishes
+    if not start_profiler(known_args, env):
+        parser.print_usage()
+    try:
+        # Skip issue reported by Bandit.
+        # Issue: [B102:exec_used] Use of exec detected.
+        # https://bandit.readthedocs.io/en/latest/plugins/b102_exec_used.html
+        # We need exec(..) here to run the code from the customer.
+        # Only the code from the customer's script is executed and only inside the customer's environment,
+        # so the customer's code cannot be altered before it is executed.
+        exec(code, globs, None)  # nosec
+    finally:
+        if profiler is not None:
+            profiler.stop()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/codeguru_profiler_agent/agent_metadata/__init__.py b/codeguru_profiler_agent/agent_metadata/__init__.py
new file mode 100644
index 0000000..b6e690f
--- /dev/null
+++ b/codeguru_profiler_agent/agent_metadata/__init__.py
@@ -0,0 +1 @@
+from . import *
diff --git a/codeguru_profiler_agent/agent_metadata/agent_metadata.py b/codeguru_profiler_agent/agent_metadata/agent_metadata.py
new file mode 100644
index 0000000..2264c4f
--- /dev/null
+++ b/codeguru_profiler_agent/agent_metadata/agent_metadata.py
@@ -0,0 +1,96 @@
+import json
+from platform import python_version
+
+from codeguru_profiler_agent.agent_metadata.aws_ec2_instance import AWSEC2Instance
+from codeguru_profiler_agent.agent_metadata.aws_fargate_task import AWSFargateTask
+from codeguru_profiler_agent.agent_metadata.fleet_info import DefaultFleetInfo
+
+
+# NOTE: Please do not alter the values of the following constants without fully understanding how they are used.
+# These constants are used in several scripts, including setup.py.
+__agent_name__ = "CodeGuruProfiler-python"
+__agent_version__ = "1.0.1"
+
+
+def look_up_fleet_info(
+        platform_metadata_fetchers=(
+                AWSEC2Instance.look_up_metadata,
+                AWSFargateTask.look_up_metadata
+        )
+):
+    for metadata_fetcher in platform_metadata_fetchers:
+        fleet_info = metadata_fetcher()
+        if fleet_info is not None:
+            return fleet_info
+
+    return DefaultFleetInfo()
+
+
+class AgentInfo:
+    PYTHON_AGENT = __agent_name__
+    CURRENT_VERSION = __agent_version__
+
+    def __init__(self, agent_type=PYTHON_AGENT, version=CURRENT_VERSION):
+        self.agent_type = agent_type
+        self.version = version
+
+    @classmethod
+    def default_agent_info(cls):
+        return cls()
+
+    def __eq__(self, other):
+        if not isinstance(other, AgentInfo):
+            return False
+
+        return self.agent_type == other.agent_type and self.version == other.version
+
+
+class AgentMetadata:
+    """
+    This is instantiated once in profiler.py, stored in the profiler's environment dictionary and reused in the
+    other parts. When you need to override it for testing other components, update those components to allow a
+    default parameter for agent_metadata, or use environment["agent_metadata"].
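+
+    An editor's illustration of the serialized shape (values are examples only):
+
+    >>> metadata = AgentMetadata(fleet_info=DefaultFleetInfo())
+    >>> report = metadata.serialize_to_json(
+    ...     sample_weight=1.0, duration_ms=300000, cpu_time_seconds=1.5,
+    ...     average_num_threads=4.0, overhead_ms=0, memory_usage_mb=35)
+    >>> report["agentInfo"]["type"]
+    'CodeGuruProfiler-python'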
+ """ + def __init__(self, + fleet_info=None, + agent_info=AgentInfo.default_agent_info(), + runtime_version=python_version()): + self._fleet_info = fleet_info + self.agent_info = agent_info + self.runtime_version = runtime_version + self.json_rep = None + + @property + def fleet_info(self): + if self._fleet_info is None: + self._fleet_info = look_up_fleet_info() + return self._fleet_info + + def serialize_to_json(self, sample_weight, duration_ms, cpu_time_seconds, + average_num_threads, overhead_ms, memory_usage_mb): + """ + This needs to be compliant with agent profile schema. + """ + if self.json_rep is None: + self.json_rep = { + "sampleWeights": { + "WALL_TIME": sample_weight + }, + "durationInMs": duration_ms, + "fleetInfo": self.fleet_info.serialize_to_map(), + "agentInfo": { + "type": self.agent_info.agent_type, + "version": self.agent_info.version + }, + "agentOverhead": { + "memory_usage_mb": memory_usage_mb + }, + "runtimeVersion": self.runtime_version, + "cpuTimeInSeconds": cpu_time_seconds, + "metrics": { + "numThreads": average_num_threads + } + } + if overhead_ms != 0: + self.json_rep["agentOverhead"]["timeInMs"] = overhead_ms + return self.json_rep diff --git a/codeguru_profiler_agent/agent_metadata/aws_ec2_instance.py b/codeguru_profiler_agent/agent_metadata/aws_ec2_instance.py new file mode 100644 index 0000000..28fd2c7 --- /dev/null +++ b/codeguru_profiler_agent/agent_metadata/aws_ec2_instance.py @@ -0,0 +1,54 @@ +import logging +from codeguru_profiler_agent.utils.log_exception import log_exception +from codeguru_profiler_agent.agent_metadata.fleet_info import FleetInfo, http_get + +# Currently, there is not a utility function in boto3 to retrieve the instance metadata; hence we would need +# get the metadata through URI. +# See https://github.com/boto/boto3/issues/313 for tracking the work for supporting such function in boto3 +DEFAULT_EC2_METADATA_URI = "http://169.254.169.254/latest/meta-data/" +EC2_HOST_NAME_URI = DEFAULT_EC2_METADATA_URI + "local-hostname" +EC2_HOST_INSTANCE_TYPE_URI = DEFAULT_EC2_METADATA_URI + "instance-type" + +logger = logging.getLogger(__name__) + +class AWSEC2Instance(FleetInfo): + """ + This class will get and parse the EC2 metadata if available. + See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html. + """ + + def __init__(self, host_name, host_type): + super().__init__() + self.host_name = host_name + self.host_type = host_type + + def get_fleet_instance_id(self): + return self.host_name + + @classmethod + def __look_up_host_name(cls): + # The id of the fleet element. Eg. host name in ec2. + return http_get(url=EC2_HOST_NAME_URI).read().decode() + + @classmethod + def __look_up_instance_type(cls): + return http_get(url=EC2_HOST_INSTANCE_TYPE_URI).read().decode() + + @classmethod + def look_up_metadata(cls): + try: + return cls( + host_name=cls.__look_up_host_name(), + host_type=cls.__look_up_instance_type() + ) + except Exception: + log_exception(logger, "Unable to get Ec2 instance metadata, this is normal when running in a different " + "environment (e.g. 
Fargate), profiler will still work") + return None + + def serialize_to_map(self): + return { + "computeType": "aws_ec2_instance", + "hostName": self.host_name, + "hostType": self.host_type + } diff --git a/codeguru_profiler_agent/agent_metadata/aws_fargate_task.py b/codeguru_profiler_agent/agent_metadata/aws_fargate_task.py new file mode 100644 index 0000000..5ec8c05 --- /dev/null +++ b/codeguru_profiler_agent/agent_metadata/aws_fargate_task.py @@ -0,0 +1,64 @@ +import logging +import os +import json +from codeguru_profiler_agent.agent_metadata.fleet_info import FleetInfo, http_get + +# https://docs.amazonaws.cn/en_us/AmazonECS/latest/developerguide/task-metadata-endpoint-v4.html +ECS_CONTAINER_METADATA_URI = "ECS_CONTAINER_METADATA_URI_V4" + +logger = logging.getLogger(__name__) + +class AWSFargateTask(FleetInfo): + """ + This class will get and parse the Fargate metadata if available. + Note, limit can be None if no resource limit is defined. + See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint-v3.html + """ + + def __init__(self, task_arn, cpu_limit, memory_limit_in_mb): + super().__init__() + self.task_arn = task_arn + self.cpu_limit = cpu_limit + self.memory_limit_in_mb = memory_limit_in_mb + + def get_fleet_instance_id(self): + return self.task_arn + + @classmethod + def __look_up_metadata(cls, url): + return json.load(http_get(url=url + "/task")) + + + @classmethod + def look_up_metadata(cls, url=os.getenv(ECS_CONTAINER_METADATA_URI)): + """ + The hardcoded keys below are described on + https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint-v3.html + """ + if not url: + return None + try: + metadata = cls.__look_up_metadata(url=url) + + return cls( + task_arn=metadata["TaskARN"], + cpu_limit=metadata.get("Limits", {}).get("CPU"), + memory_limit_in_mb=metadata.get("Limits", {}).get("Memory") + ) + except Exception: + # if we are not in a fargate environment we expect to have url being None so we expect to return before, + # catching an exception here should be rare so we do print the stack trace. + logger.info("Unable to get Fargate instance metadata, this is normal when running in a different " + "environment (e.g. 
local dev machine), profiler will still work: ", exc_info=True) + return None + + def serialize_to_map(self): + fleet_info_in_map = { + "computeType": "aws_fargate_task", + "taskArn": self.task_arn + } + if self.cpu_limit is not None: + fleet_info_in_map["cpuLimit"] = self.cpu_limit + if self.memory_limit_in_mb is not None: + fleet_info_in_map["memoryLimitInMB"] = self.memory_limit_in_mb + return fleet_info_in_map diff --git a/codeguru_profiler_agent/agent_metadata/aws_lambda.py b/codeguru_profiler_agent/agent_metadata/aws_lambda.py new file mode 100644 index 0000000..0056b15 --- /dev/null +++ b/codeguru_profiler_agent/agent_metadata/aws_lambda.py @@ -0,0 +1,110 @@ +import os +import logging +import uuid + +from codeguru_profiler_agent.agent_metadata.fleet_info import FleetInfo +from codeguru_profiler_agent.aws_lambda.lambda_context import LambdaContext + +logger = logging.getLogger(__name__) + +LAMBDA_MEMORY_SIZE_ENV = "AWS_LAMBDA_FUNCTION_MEMORY_SIZE" +LAMBDA_EXECUTION_ENV = "AWS_EXECUTION_ENV" + +# Those are used for the configure agent call: +# See https://docs.aws.amazon.com/codeguru/latest/profiler-api/API_ConfigureAgent.html +# note how these are not consistent with the profile schema which is unfortunate +COMPUTE_PLAFORM_KEY = "ComputePlatform" +COMPUTE_PLAFORM_VALUE = "AWSLambda" +AGENT_ID_KEY = "AgentId" +AWS_REQUEST_ID_KEY = "AwsRequestId" +EXECUTION_ENVIRONMENT_KEY = "ExecutionEnvironment" +LAMBDA_FUNCTION_ARN_KEY = "LambdaFunctionArn" +LAMBDA_MEMORY_LIMIT_IN_MB_KEY = "LambdaMemoryLimitInMB" +LAMBDA_PREVIOUS_EXECUTION_TIME_IN_MILLISECONDS_KEY = "LambdaPreviousExecutionTimeInMilliseconds" +LAMBDA_REMAINING_TIME_IN_MILLISECONDS_KEY = "LambdaRemainingTimeInMilliseconds" +# LambdaTimeGapBetweenInvokesInMilliseconds not sent at the moment + +class AWSLambda(FleetInfo): + """ + This class will get and parse the lambda metadata from the environment. 
For details about available env vars,
+    see https://docs.aws.amazon.com/lambda/latest/dg/configuration-envvars.html
+    """
+    def __init__(self, function_arn, memory_limit_mb, execution_env, agent_id=None):
+        super().__init__()
+        self.function_arn = function_arn
+        self.memory_limit_mb = memory_limit_mb
+        self.execution_env = execution_env
+        self.agent_id = agent_id or str(uuid.uuid4())
+
+    def get_fleet_instance_id(self):
+        return self.function_arn
+
+    @classmethod
+    def __look_up_memory_limit(cls, env=os.environ):
+        try:
+            return int(env.get(LAMBDA_MEMORY_SIZE_ENV))
+        except (TypeError, ValueError):
+            return None
+
+    @classmethod
+    def __look_execution_env(cls, env=os.environ):
+        return env.get(LAMBDA_EXECUTION_ENV)
+
+    @classmethod
+    def __look_function_arn(cls, context):
+        return context.invoked_function_arn
+
+    @classmethod
+    def look_up_metadata(cls, context, env=os.environ):
+        """
+        The context parameter must be provided; env is optional and defaults to os.environ.
+        """
+        try:
+            return cls(
+                function_arn=cls.__look_function_arn(context),
+                memory_limit_mb=cls.__look_up_memory_limit(env),
+                execution_env=cls.__look_execution_env(env)
+            )
+        except Exception:
+            logger.info("Unable to get Lambda metadata", exc_info=True)
+            return None
+
+    def serialize_to_map(self):
+        as_map = {
+            "computeType": "aws_lambda",
+            "functionArn": self.function_arn
+        }
+        if self.memory_limit_mb:
+            as_map["memoryLimitInMB"] = self.memory_limit_mb
+        if self.execution_env:
+            as_map["executionEnv"] = self.execution_env
+        return as_map
+
+    def get_metadata_for_configure_agent_call(self, lambda_context=None):
+        """
+        This gathers metadata from self and from the given lambda context to build a map used for the
+        configure_agent call.
+        :param lambda_context: a LambdaContext object which contains mainly the context from the lambda framework.
+            See https://docs.aws.amazon.com/lambda/latest/dg/python-context.html for details about the context.
+        :return: a map with all metadata we want to send in the configure_agent call.
+        """
+        # get the singleton lambda context. The decorator should set it.
+        if lambda_context is None:
+            lambda_context = LambdaContext.get()
+
+        as_map = {
+            COMPUTE_PLAFORM_KEY: COMPUTE_PLAFORM_VALUE,
+            LAMBDA_FUNCTION_ARN_KEY: self.function_arn,
+            AGENT_ID_KEY: self.agent_id
+        }
+        if self.memory_limit_mb:
+            as_map[LAMBDA_MEMORY_LIMIT_IN_MB_KEY] = str(self.memory_limit_mb)
+        if self.execution_env:
+            as_map[EXECUTION_ENVIRONMENT_KEY] = self.execution_env
+        if lambda_context.context is not None:
+            as_map[AWS_REQUEST_ID_KEY] = lambda_context.context.aws_request_id
+            as_map[LAMBDA_REMAINING_TIME_IN_MILLISECONDS_KEY] = \
+                str(lambda_context.context.get_remaining_time_in_millis())
+        if lambda_context.last_execution_duration:
+            as_map[LAMBDA_PREVIOUS_EXECUTION_TIME_IN_MILLISECONDS_KEY] = \
+                str(int(lambda_context.last_execution_duration.total_seconds() * 1000))
+        return as_map
diff --git a/codeguru_profiler_agent/agent_metadata/fleet_info.py b/codeguru_profiler_agent/agent_metadata/fleet_info.py
new file mode 100644
index 0000000..58a5152
--- /dev/null
+++ b/codeguru_profiler_agent/agent_metadata/fleet_info.py
@@ -0,0 +1,60 @@
+from abc import ABCMeta, abstractmethod
+import uuid
+from urllib import request
+import os
+
+METADATA_URI_TIMEOUT_SECONDS = 3
+
+def http_get(url):
+    # request.urlopen has been flagged as risky because it can be used to open local files if the url starts with
+    # file://; to protect us from that we add a check on the passed url.
+ # With this check we can tell bandit (static analysis tool) to ignore this error with #nosec + if not url.startswith("http"): + raise ValueError("url for metadata is not a valid http address. We will not try to get metadata") + return request.urlopen(url, timeout=METADATA_URI_TIMEOUT_SECONDS) # nosec + + +class FleetInfo(metaclass=ABCMeta): # pragma: no cover + def __init__(self): + pass + + """ + Returns the id of the fleet element, that is used to tell if multiple agents come from the same fleet element. + This id can be the hostname for an EC2 or the task ARN for Fargate. + @return the id in string. + """ + @abstractmethod + def get_fleet_instance_id(self): + pass + + @abstractmethod + def serialize_to_map(self): + pass + + def get_metadata_for_configure_agent_call(self): + """ + Used for the configure_agent call, the default is to send nothing. Specifics are sent for Lambda. + """ + return None + + +class DefaultFleetInfo(FleetInfo): + + def __init__(self): + self.fleet_instance_id = str(uuid.uuid4()) + try: + # sched_getaffinity gives the number of logical cpus that the process can use on Unix systems. + # If not available, we default to the cpu_count(). + self.vCPUs = len(os.sched_getaffinity(0)) + except AttributeError: + self.vCPUs = os.cpu_count() + + def get_fleet_instance_id(self): + return self.fleet_instance_id + + def serialize_to_map(self): + return { + "id": self.fleet_instance_id, + "type": "UNKNOWN", + "vCPUs": self.vCPUs + } diff --git a/codeguru_profiler_agent/aws_lambda/__init__.py b/codeguru_profiler_agent/aws_lambda/__init__.py new file mode 100644 index 0000000..b6e690f --- /dev/null +++ b/codeguru_profiler_agent/aws_lambda/__init__.py @@ -0,0 +1 @@ +from . import * diff --git a/codeguru_profiler_agent/aws_lambda/lambda_context.py b/codeguru_profiler_agent/aws_lambda/lambda_context.py new file mode 100644 index 0000000..844cd7f --- /dev/null +++ b/codeguru_profiler_agent/aws_lambda/lambda_context.py @@ -0,0 +1,23 @@ +from datetime import timedelta + +_singleton = None + + +class LambdaContext: + """ + This class contains the contextual data about AWS lambda execution + It is kept updated by the profiler decorator when a lambda function is called + It has a singleton pattern to make it available from anywhere for convenience. + """ + def __init__(self): + self.context = None + self.last_execution_duration = timedelta() + + @classmethod + def get(cls): + global _singleton + if _singleton is None: + _singleton = LambdaContext() + return _singleton + + diff --git a/codeguru_profiler_agent/aws_lambda/lambda_handler.py b/codeguru_profiler_agent/aws_lambda/lambda_handler.py new file mode 100644 index 0000000..aebf76c --- /dev/null +++ b/codeguru_profiler_agent/aws_lambda/lambda_handler.py @@ -0,0 +1,53 @@ +import os +import logging +from codeguru_profiler_agent.aws_lambda.profiler_decorator import with_lambda_profiler + +HANDLER_ENV_NAME = "_HANDLER" +HANDLER_ENV_NAME_FOR_CODEGURU = "HANDLER_ENV_NAME_FOR_CODEGURU" +logger = logging.getLogger(__name__) + + +def restore_handler_env(original_handler, env=os.environ): + env[HANDLER_ENV_NAME] = original_handler + + +def load_handler(bootstrap_module, env=os.environ, original_handler_env_key=HANDLER_ENV_NAME_FOR_CODEGURU): + try: + original_handler_name = env.get(original_handler_env_key) + if not original_handler_name: + raise ValueError("Could not find module and function name from " + HANDLER_ENV_NAME_FOR_CODEGURU + + " environment variable") + + # Delegate to the lambda code to load the customer's module. 
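+        # Editor's note, for context (the handler string is a hypothetical example): with the original
+        # _HANDLER value "app.handler", newer bootstrap versions expose _get_handler("app.handler"),
+        # which imports module "app" and returns its "handler" callable; that callable is what gets
+        # wrapped by with_lambda_profiler below.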
+ if hasattr(bootstrap_module, '_get_handler'): + customer_handler_function = bootstrap_module._get_handler(original_handler_name) + else: + # Support for python 3.6 bootstrap which is different. It is starting to be quite hacky at this point. + # We will need to discuss if it is worth handling this case and if we can do it in a better way. + # This returns both a init_handler and the function, we apply the init right away as we are in init process + init_handler, customer_handler_function = bootstrap_module._get_handlers( + handler=original_handler_name, + mode='event', # with 'event' it will return the function as is (handlerfn in the lambda code) + # 'http' would return wsgi.handle_one(sockfd, ('localhost', 80), handlerfn) instead + invokeid='unknown_id') # FIXME invokeid is used for error handling, need to see if we can get it + init_handler() + restore_handler_env(original_handler_name, env) + return customer_handler_function + except: + # Using print here as logger would probably not be properly initiated at this point. + print("Could not load the handler function to decorate it with CodeGuru Profiler." + + " If the exception error does not help you, please try removing the CodeGuru Profiler layer" + + " from your lambda function and contact AWS support.") + raise + + +# Load the customer's handler, this should be done at import time which means it is done when lambda frameworks +# loads our module. We load the bootstrap module by string name so that IDE does not complain +lambda_bootstrap_module = __import__("bootstrap") +handler_function = load_handler(lambda_bootstrap_module) + + +# WARNING: Do not rename this file, this function or HANDLER_ENV_NAME_FOR_CODEGURU without changing the bootstrap script +@with_lambda_profiler() +def call_handler(event, context): + return handler_function(event, context) diff --git a/codeguru_profiler_agent/aws_lambda/profiler_decorator.py b/codeguru_profiler_agent/aws_lambda/profiler_decorator.py new file mode 100644 index 0000000..4c8ad40 --- /dev/null +++ b/codeguru_profiler_agent/aws_lambda/profiler_decorator.py @@ -0,0 +1,86 @@ +import os +from datetime import datetime +from codeguru_profiler_agent.aws_lambda.lambda_context import LambdaContext + +_profiler = None + + +def _create_lambda_profiler(profiling_group_name, region_name, environment_override, context, env=os.environ): + """ + Calls build_profiler module to create the profiler object. If we fail to create it we return a no-op profiler + so that we don't even go through this method again. + """ + from codeguru_profiler_agent.profiler_builder import build_profiler + from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata + from codeguru_profiler_agent.agent_metadata.aws_lambda import AWSLambda + override = {'agent_metadata': AgentMetadata(AWSLambda.look_up_metadata(context))} + override.update(environment_override) + profiler = build_profiler(pg_name=profiling_group_name, region_name=region_name, override=override, env=env) + if profiler is None: + return _EmptyProfiler() + return profiler + + +def with_lambda_profiler(profiling_group_name=None, region_name=None, environment_override=dict(), + profiler_factory=_create_lambda_profiler, env=os.environ): + """ + Adds profiler start and pause calls around given function execution. + start() and pause() should never throw exceptions. + :param profiling_group_name: name of the profiling group where the profiles will be stored. 
+    :param region_name: AWS Region to report to; the given profiling group name must exist in that region. Note
+        that this value overwrites what is used in aws_session. If not provided, boto3 will search the
+        configuration for the region. (e.g. "us-west-2")
+    :param environment_override: custom dependency container dictionary; allows custom behavior to be injected.
+        See the Profiler class for details.
+    """
+    def function_decorator(function):
+        def profiler_decorate(event, context):
+            start_time = datetime.now()
+            global _profiler
+            if _profiler is None:
+                _profiler = profiler_factory(profiling_group_name=profiling_group_name,
+                                             region_name=region_name,
+                                             environment_override=environment_override,
+                                             context=context, env=env)
+            LambdaContext.get().context = context
+            if not _profiler.start():
+                # if start() failed, there is a high chance it will fail again,
+                # so we disable the profiler to prevent further attempts.
+                _profiler = _EmptyProfiler()
+            try:
+                return function(event, context)
+            finally:
+                LambdaContext.get().last_execution_duration = datetime.now() - start_time
+                _profiler.pause()
+
+        return profiler_decorate
+
+    return function_decorator
+
+
+def clear_static_profiler():
+    """
+    Used for unit tests
+    """
+    global _profiler
+    if _profiler is not None:
+        _profiler.stop()
+        _profiler = None
+
+
+class _EmptyProfiler:
+    """
+    This class implements the public interface of Profiler but does nothing
+    """
+
+    def start(self, block=False):
+        return True
+
+    def pause(self, block=False):
+        return True
+
+    def is_running(self):
+        return False
+
+    def stop(self):
+        return True
diff --git a/codeguru_profiler_agent/codeguru_client_builder.py b/codeguru_profiler_agent/codeguru_client_builder.py
new file mode 100644
index 0000000..bb9b767
--- /dev/null
+++ b/codeguru_profiler_agent/codeguru_client_builder.py
@@ -0,0 +1,97 @@
+"""
+This module handles creating the CodeGuru Profiler client.
+"""
+import boto3
+import logging
+
+from botocore.config import Config
+
+logger = logging.getLogger(__name__)
+
+
+class CodeGuruClientBuilder:
+    """
+    Utility class that can create the CodeGuru Profiler sdk client.
+    The goal is to make sure it is created only when needed, hopefully inside the profiler runner so the customer's
+    code is not blocked waiting for the sdk client to be built.
+    This is also useful to share the client between the reporter and the orchestrator.
+    The client can be provided directly in the environment with the key 'codeguru_profiler_client'; this can be
+    used for testing or customization.
+    If no client is provided explicitly, the builder creates one from the boto3.Session object, either the one
+    provided directly by the customer or the default aws session.
+    """
+
+    def __init__(self, environment):
+        self._provided_endpoint_url = environment.get("endpoint_url")
+        self._codeguru_client_instance = environment.get("codeguru_profiler_client")
+        self._aws_session = environment.get("aws_session")
+        self._region_name = environment.get("region_name")
+
+    @property
+    def codeguru_client(self):
+        """
+        Creates a client instance the first time it is called.
+        :return: a codeguru client object from the AWS SDK
+        """
+        if self._codeguru_client_instance is None:
+            logger.debug("Initializing an instance of a codeguru client.")
+            self._codeguru_client_instance = self._create_codeguru_client()
+        return self._codeguru_client_instance
+
+    def _create_codeguru_client(self):
+        region = self._get_region_or_default()
+
+        # the default retry mode is 'legacy', not 'standard'
+        standard_config = Config(
+            retries={
+                'mode': 'standard'
+            }
+        )
+        return self._get_session_or_default().client(
+            service_name='codeguruprofiler',
+            region_name=region,
+            endpoint_url=self._provided_endpoint_url,
+            config=standard_config)
+
+    def _get_region_or_default(self):
+        """
+        Creating an AWS client can fail with NoRegionError if there was no way for the sdk to detect the region.
+        If the region is not specified in the client constructor through providing a boto3.Session, the sdk
+        searches in ~/.aws/config or AWS_DEFAULT_REGION.
+        Any user running in an EC2, lambda, fargate... etc environment would have this set somehow, but to prevent
+        the agent from crashing for customers running in a different environment we set the region to us-east-1
+        (IAD) with a warning if none is defined.
+        """
+        region_from_session = self._get_session_or_default().region_name
+        if self._region_name is not None:
+            if region_from_session is not None and self._region_name != region_from_session:
+                logger.info("Different region settings were detected - session: {region_from_session}, "
                            "parameter: {region_from_param}. By default, client will be created with region "
                            "specified in parameter: {region_from_param}"
+                            .format(region_from_session=region_from_session, region_from_param=self._region_name))
+            return self._region_name
+
+        # let the SDK search for the region...
+        if region_from_session is not None:
+            return region_from_session
+
+        # No region has been set; creating an AWS client would fail in this case, so set the region to us-east-1
+        # by default. It is debatable whether we want to make this best effort here: we could simply let the boto3
+        # client raise an exception when it finds that there is no way to choose the region. We do this so the
+        # agent does not block the process for users running it in a local environment, even if it will probably
+        # not work unless their profiling group happens to be in us-east-1. This also saves us from having to set
+        # a region everywhere in unit tests.
+        logger.info(
+            "AWS SDK was not able to detect the region either from the config file or an environment variable, "
+            + "default region will be set to us-east-1. Make sure you set the region_name parameter in the "
+            + "Profiler constructor to the region in which you created your profiling group. "
+            + "See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html for other ways."
+        )
+        return "us-east-1"
+
+    def _get_session_or_default(self):
+        """
+        Creates a default boto3 session if it was not provided in init.
+        """
+        if not self._aws_session:
+            self._aws_session = boto3.session.Session()
+        return self._aws_session
diff --git a/codeguru_profiler_agent/file_reporter/__init__.py b/codeguru_profiler_agent/file_reporter/__init__.py
new file mode 100644
index 0000000..b6e690f
--- /dev/null
+++ b/codeguru_profiler_agent/file_reporter/__init__.py
@@ -0,0 +1 @@
+from . 
import * diff --git a/codeguru_profiler_agent/file_reporter/file_reporter.py b/codeguru_profiler_agent/file_reporter/file_reporter.py new file mode 100644 index 0000000..d9c58c0 --- /dev/null +++ b/codeguru_profiler_agent/file_reporter/file_reporter.py @@ -0,0 +1,52 @@ +import logging +import datetime + +from codeguru_profiler_agent.reporter.reporter import Reporter +from codeguru_profiler_agent.sdk_reporter.profile_encoder import ProfileEncoder + +logger = logging.getLogger(__name__) + + +class FileReporter(Reporter): + """ + Writes JSON-encoded profiles to a file; this is used for testing purposes. + """ + + _FILE_SUFFIX = ".json" + + def __init__(self, environment=dict()): + """ + :param environment: dependency container dictionary for the current profiler + :param file_prefix: (required inside environment) path + file prefix to use for profile reports + """ + self._file_prefix = environment["file_prefix"] + self._profile_encoder = \ + environment.get("profile_encoder") or ProfileEncoder(gzip=False, environment=environment) + + def setup(self): + """ + File reporter has static configuration, no expensive resources to be initialized. + """ + pass + + def refresh_configuration(self): + """ + File reporter has static configuration, no refresh. + """ + pass + + def report(self, profile, agent_metadata=None, timestamp=datetime.datetime.now()): + output_filename = self._output_filename_for(timestamp) + + logger.info("Writing profile to '{}'".format(output_filename)) + + with open(output_filename, 'wb') as output_file_stream: + self._profile_encoder.encode( + profile=profile, output_stream=output_file_stream) + + return output_filename + + def _output_filename_for(self, timestamp): + return self._file_prefix \ + + timestamp.strftime("%Y-%m-%dT%H-%M-%S") \ + + self._FILE_SUFFIX diff --git a/codeguru_profiler_agent/local_aggregator.py b/codeguru_profiler_agent/local_aggregator.py new file mode 100644 index 0000000..76795b2 --- /dev/null +++ b/codeguru_profiler_agent/local_aggregator.py @@ -0,0 +1,127 @@ +import logging +import time +import datetime + +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration +from codeguru_profiler_agent.metrics.with_timer import with_timer +from codeguru_profiler_agent.model.profile import Profile +from codeguru_profiler_agent.utils.time import current_milli_time + +logger = logging.getLogger(__name__) + + +class LocalAggregator: + """ + The local aggregator receives samples reported by the sampler and aggregates them into one profile. + When flush is called, the aggregator will first check whether the profile should be flushed or not (e.g. + whether the reporting interval is reached or not) and the profile will be reported through the configured reporter. + Each time we sample, the aggregator will check if the memory usage of the profile exceeds the memory limit. + If memory limit is violated, a force_flush will be executed first; if the last flush action happened within + the minimum time for reporting, OverMemoryLimitException will be raised. 
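+    A rough usage sketch (editor's illustration; my_reporter and sample are placeholders, and
+    AgentConfiguration must already be initialized before _reset() is called):
+
+    >>> aggregator = LocalAggregator(reporter=my_reporter, environment={
+    ...     "profiling_group_name": "myGroup", "host_weight": 1.0,
+    ...     "timer": Timer(), "memory_limit_bytes": 10 * 1024 * 1024})
+    >>> aggregator.add(sample)  # called by the sampler for every new sample
+    >>> aggregator.flush()      # reports and resets once the reporting interval is due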
+ """ + + def __init__(self, reporter, environment=dict()): + """ + :param reporter: reporter object used for reporting profiles + :param environment: dependency container dictionary for the current profiler + :param profiling_group_name: (required inside environment) name of the profiling group + :param memory_limit_bytes: (required inside environment) memory limit (Bytes) for profiler + :param host_weight: (required inside environment) scale factor used to rescale the profile collected in this + host to make the profile representative of the whole fleet + :param timer: (required inside environment) timer to be used for metrics + :param profile_factory: (inside environment) the factory to created profiler; default Profile. + :param clock: (inside environment) clock to be used; default is time.time + """ + self.reporter = reporter + self.profiling_group_name = environment["profiling_group_name"] + self.host_weight = environment["host_weight"] + self.timer = environment["timer"] + + self.profile_factory = environment.get("profile_factory") or Profile + self.clock = environment.get("clock") or time.time + + self.profile = None + self.memory_limit_bytes = environment["memory_limit_bytes"] + self.last_report_attempted = current_milli_time(clock=self.clock) + + self._reset() + + def add(self, sample): + """ + Aggregate reported sample into the profile + :param sample: Sample instance reported by Sampler + """ + self._aggregate_sample(sample) + self._check_memory_limit() + + def setup(self): + self.reporter.setup() + + @with_timer("aggregateThreadDumps") + def _aggregate_sample(self, sample): + self.profile.add(sample) + + def _check_memory_limit(self): + if self.profile.get_memory_usage_bytes() > self.memory_limit_bytes: + if self._is_under_min_reporting_time( + current_milli_time(clock=self.clock)): + raise OverMemoryLimitException( + "Profiler memory usage limit has been reached") + self.flush(force=True) + + def _reset(self): + self.profile = self.profile_factory( + profiling_group_name=self.profiling_group_name, + sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(), + host_weight=self.host_weight, + start=current_milli_time(clock=self.clock), + clock=self.clock + ) + self.timer.reset() + + @with_timer("flush") + def flush(self, force=False): + now = current_milli_time(clock=self.clock) + reported = False + if not force and not self._is_over_reporting_interval(now): + return False + + if self._is_under_min_reporting_time(now): + logger.info("Dropping the profile as it is under the minimum reporting time") + else: + self._report_profile(now) + reported = True + + if force or reported: + self._reset() + return reported + + def refresh_configuration(self): + self.reporter.refresh_configuration() + + def _report_profile(self, now): + self.last_report_attempted = now + self._add_overhead_metric_to_profile() + self.profile.end = now + logger.info("Attempting to report profile data: " + str(self.profile)) + if self.profile.is_empty(): + logger.info("Report was cancelled because it was empty") + return False + return self.reporter.report(self.profile) + + def _is_under_min_reporting_time(self, now): + return AgentConfiguration.get().is_under_min_reporting_time(now - self.last_report_attempted) + + def _is_over_reporting_interval(self, now): + return AgentConfiguration.get().is_over_reporting_interval(now - self.last_report_attempted) + + def _add_overhead_metric_to_profile(self): + run_profiler_metric = self.timer and self.timer.get_metric("runProfiler") + + # By 
default, if overhead_ms is 0 in Profile, it will be treated as overhead not available during encoding. + run_profiler_metric_seconds = run_profiler_metric.total if run_profiler_metric else 0 + self.profile.set_overhead_ms(duration_timedelta=datetime.timedelta(seconds=run_profiler_metric_seconds)) + + +class OverMemoryLimitException(Exception): + pass diff --git a/codeguru_profiler_agent/metrics/__init__.py b/codeguru_profiler_agent/metrics/__init__.py new file mode 100644 index 0000000..b6e690f --- /dev/null +++ b/codeguru_profiler_agent/metrics/__init__.py @@ -0,0 +1 @@ +from . import * diff --git a/codeguru_profiler_agent/metrics/metric.py b/codeguru_profiler_agent/metrics/metric.py new file mode 100644 index 0000000..af0e432 --- /dev/null +++ b/codeguru_profiler_agent/metrics/metric.py @@ -0,0 +1,19 @@ +class Metric: + def __init__(self): + self.counter = 0 + self.total = 0 + self.max = 0 + + def add(self, value): + self.counter += 1 + self.total += value + if self.max < value: + self.max = value + + def average(self): + return 0 if self.counter == 0 else self.total / self.counter + + def __repr__(self): + return "{}@{:#04x}(counter={}, total={:.5f}, max={:.5f}, average={:.5f})".format( + self.__class__.__name__, id(self), self.counter, self.total, + self.max, self.average()) diff --git a/codeguru_profiler_agent/metrics/timer.py b/codeguru_profiler_agent/metrics/timer.py new file mode 100644 index 0000000..2d51aa1 --- /dev/null +++ b/codeguru_profiler_agent/metrics/timer.py @@ -0,0 +1,23 @@ +from codeguru_profiler_agent.metrics.metric import Metric + + +class Timer: + """ + Keeps all metrics collected during Profiler execution. + At the moment these metrics are collected but not reported to the backend. + """ + + def __init__(self): + self.metrics = {} + + def record(self, name, value): + metric = self.metrics.get(name) + if metric is None: + metric = self.metrics[name] = Metric() + metric.add(value) + + def reset(self): + self.metrics = {} + + def get_metric(self, name): + return self.metrics.get(name) diff --git a/codeguru_profiler_agent/metrics/with_timer.py b/codeguru_profiler_agent/metrics/with_timer.py new file mode 100644 index 0000000..217c03b --- /dev/null +++ b/codeguru_profiler_agent/metrics/with_timer.py @@ -0,0 +1,34 @@ +from __future__ import absolute_import + +from time import perf_counter +from time import process_time + + +def with_timer(metric_name, measurement="cpu-time"): + """ + The decorator only works on all methods under the class which contains a Timer (with name timer). + """ + + def wrapper(fn): + if measurement == "cpu-time": + get_time_seconds = process_time + elif measurement == "wall-clock-time": + get_time_seconds = perf_counter + else: + raise Exception( + "Unexpected measurement mode for timer '{}'".format( + str(measurement))) + + def timed(self, *args, **kwargs): + if self.timer is None: + return fn(self, *args, **kwargs) + time_start_seconds = get_time_seconds() + result = fn(self, *args, **kwargs) + self.timer.record(metric_name, + get_time_seconds() - time_start_seconds) + + return result + + return timed + + return wrapper diff --git a/codeguru_profiler_agent/model/__init__.py b/codeguru_profiler_agent/model/__init__.py new file mode 100644 index 0000000..b6e690f --- /dev/null +++ b/codeguru_profiler_agent/model/__init__.py @@ -0,0 +1 @@ +from . 
import *
diff --git a/codeguru_profiler_agent/model/call_graph_node.py b/codeguru_profiler_agent/model/call_graph_node.py
new file mode 100644
index 0000000..f128618
--- /dev/null
+++ b/codeguru_profiler_agent/model/call_graph_node.py
@@ -0,0 +1,108 @@
+class CallGraphNode:
+    # Python magic: By declaring which fields/slots this class is going to have in advance, Python dispenses with
+    # the internal dictionary where these are usually stored, which noticeably improves the memory footprint that
+    # we are trying to optimize for with this class.
+    #
+    # Note that these need to be maintained in sync with the fields being used by the class.
+    __slots__ = ("frame_name", "class_name", "file_path", "runnable_count", "start_line", "end_line", "children",
+                 "memory_counter")
+
+    def __init__(self, frame_name, class_name, file_path, line_no, memory_counter=None):
+        """
+        A node represents a given stack frame at a given position in the stack:
+        * it can have children -- frames that were observed to be on top of this frame in some samples
+        * it will probably have a parent (except for the root node), but does not keep a reference to it
+        It also keeps track of how many times (per thread state) this frame was observed in thread stacks across
+        one or more samples.
+
+        :param frame_name: name of the stack frame
+        :param class_name: name of the class where the method is sampled; or None if not applicable
+        :param file_path: the absolute path of the file containing the frame; or None if not applicable
+        :param line_no: the line number where we observed this node; or None if not applicable
+        """
+        self.frame_name = frame_name
+        # For normal usage of a class, we are able to extract the class name with the solution mentioned on
+        # https://stackoverflow.com/questions/2203424/python-how-to-retrieve-class-information-from-a-frame-object/2544639#2544639
+        # Note that this solution relies on the assumption that the class method takes self as its first argument
+        self.class_name = class_name
+        # In well-behaved Python code (not messing with importlib.reload and not dynamically changing the search
+        # path), the same frame_name will always correspond to the same file. Since we don't really think those
+        # use cases are common, for now we just assume this is the case and don't check for it
+        self.file_path = file_path
+        self.runnable_count = 0
+        # the start and end of the range of line numbers where we observed this node
+        # None is expected for the root node and the agent duration metric node
+        self.start_line = line_no
+        self.end_line = line_no
+        self.children = ()
+        self.memory_counter = memory_counter
+        if memory_counter:
+            memory_counter.count_create_node(frame_name, file_path, class_name)
+
+    def update_current_node_and_get_child(self, frame):
+        """
+        According to https://docs.python.org/3.3/tutorial/modules.html#the-module-search-path, the same
+        module:function cannot exist in two different files, so frame_name is the primary key when matching
+        nodes; is_node_match_frame also compares class_name and file_path to be safe.
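+
+        An editor's illustration of folding one frame into the graph (names are made up; Frame comes from
+        codeguru_profiler_agent.model.frame):
+
+        >>> root = CallGraphNode("ALL", class_name=None, file_path=None, line_no=None)
+        >>> child = root.update_current_node_and_get_child(
+        ...     Frame("my_func", file_path="/app/app.py", line_no=12))
+        >>> child.frame_name, child.start_line, child.end_line
+        ('my_func', 12, 12)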
+ """ + node = self._get_child(frame=frame) or \ + self._insert_new_child( + CallGraphNode(frame_name=frame.name, class_name=frame.class_name, file_path=frame.file_path, + line_no=frame.line_no)) + node._maybe_update_line_no(frame.line_no) + return node + + def increase_runnable_count(self, value_to_add=1): + if value_to_add < 0: + raise ValueError( + "Cannot add negative counts to node: {}".format(value_to_add)) + + self.runnable_count += value_to_add + + def _maybe_update_line_no(self, line_no): + if line_no is None: + return + if self.start_line is None or self.start_line > line_no: + self.start_line = line_no + if self.end_line is None or self.end_line < line_no: + self.end_line = line_no + + def is_node_match_frame(self, other_frame): + return self.frame_name == other_frame.name and self.class_name == other_frame.class_name and \ + self.file_path == other_frame.file_path + + # FIXME: Review use of tuple vs list vs dictionary, and linear search vs binary search + def _get_child(self, frame): + for child in self.children: + if child.is_node_match_frame(frame): + return child + return None + + def _insert_new_child(self, new_child): + """ + FIXME: We still need to review the memory vs cpu tradeoffs of using a tuple vs an list vs a dictionary here, + and if we should keep it sorted (like the Java agent) or keep using the current approach. + + Right now we use a tuple as it uses the least amount of memory (and it simplifies the code, as the empty tuple + is reused by python): + + >>> import sys + >>> sys.getsizeof((1,)) + 64 + >>> sys.getsizeof([1]) + 80 + >>> sys.getsizeof((1, 2)) + 72 + >>> sys.getsizeof([1, 2]) + 88 + + :param new_child: graph node that holds the new child frame + :return: + """ + if self.memory_counter: + if self.children: + self.memory_counter.count_add_child() + else: + self.memory_counter.count_first_child() + self.children = self.children + (new_child, ) + return new_child diff --git a/codeguru_profiler_agent/model/frame.py b/codeguru_profiler_agent/model/frame.py new file mode 100644 index 0000000..4dcd9c1 --- /dev/null +++ b/codeguru_profiler_agent/model/frame.py @@ -0,0 +1,9 @@ +class Frame: + + __slots__ = ("name", "class_name", "line_no", "file_path") + + def __init__(self, name, class_name=None, line_no=None, file_path=None): + self.name = name + self.class_name = class_name + self.line_no = line_no + self.file_path = file_path diff --git a/codeguru_profiler_agent/model/memory_counter.py b/codeguru_profiler_agent/model/memory_counter.py new file mode 100644 index 0000000..42da3b7 --- /dev/null +++ b/codeguru_profiler_agent/model/memory_counter.py @@ -0,0 +1,39 @@ +from sys import getsizeof + +from codeguru_profiler_agent.model.call_graph_node import CallGraphNode + + +class MemoryCounter: + + empty_node = CallGraphNode(frame_name="empty_node", class_name=None, file_path=None, line_no=None) + python_int_size = getsizeof(2**40) + runnable_counter_size = python_int_size + # As we store the min and max of line number for each frame, we would need to add these to our memory + # estimation twice. 
+ line_no_size = 2 * python_int_size + + empty_node_size_bytes = getsizeof(empty_node) + runnable_counter_size + base_storage_size_bytes = getsizeof((empty_node, )) + storage_increment_size_bytes = getsizeof( + (empty_node, empty_node)) - base_storage_size_bytes + + def __init__(self): + self.memory_usage_bytes = 0 + + def get_memory_usage_bytes(self): + return self.memory_usage_bytes + + def count_create_node(self, frame, file_path, class_name): + self.memory_usage_bytes += MemoryCounter.empty_node_size_bytes + self.memory_usage_bytes += getsizeof(frame) + self.memory_usage_bytes += getsizeof(file_path) + self.memory_usage_bytes += getsizeof(class_name) + # For simplicity, we assume all nodes contain line_no and we only expect root node and + # duration metric node not to have line_no. + self.memory_usage_bytes += MemoryCounter.line_no_size + + def count_first_child(self): + self.memory_usage_bytes += MemoryCounter.base_storage_size_bytes + + def count_add_child(self): + self.memory_usage_bytes += MemoryCounter.storage_increment_size_bytes diff --git a/codeguru_profiler_agent/model/profile.py b/codeguru_profiler_agent/model/profile.py new file mode 100644 index 0000000..8ab43e3 --- /dev/null +++ b/codeguru_profiler_agent/model/profile.py @@ -0,0 +1,143 @@ +import time + +from random import SystemRandom +from codeguru_profiler_agent.model.call_graph_node import CallGraphNode +from codeguru_profiler_agent.model.frame import Frame +from codeguru_profiler_agent.model.memory_counter import MemoryCounter +from codeguru_profiler_agent.utils.time import current_milli_time +from codeguru_profiler_agent.utils.time import to_iso + +ROOT_NODE_NAME = "ALL" + + +class Profile: + def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, clock=time.time): + """ + A profile holds the root node of the call graph and the metadata related to the profile + """ + self.memory_counter = MemoryCounter() + + self.profiling_group_name = profiling_group_name + self.callgraph = CallGraphNode(ROOT_NODE_NAME, class_name=None, file_path=None, line_no=None, + memory_counter=self.memory_counter) + self._validate_positive_number(start) + self.start = start + self.last_resume = start + self.last_pause = None + self._paused_ms = 0 + self._clock = clock + self._end = None + self.cpu_time_seconds = None + self.total_attempted_sample_threads_count = 0 + self.total_seen_threads_count = 0 + self.total_sample_count = 0 + self.sampling_interval_ms = int(sampling_interval_seconds * 1000) + self.host_weight = int(host_weight) + self._start_process_time = time.process_time() # provides process time in fractional seconds as float. + self.overhead_ms = 0 + + @property + def end(self): + return self._end + + @end.setter + def end(self, value): + self._validate_positive_number(value) + if value <= self.start: + raise ValueError( + "Profile end value must be bigger than {}, got {}".format(self.start, value)) + self._end = value + # this is the total cpu time spent in this application since start, not just the overhead + self.cpu_time_seconds = time.process_time() - self._start_process_time + + def get_active_millis_since_start(self): + """ + This returns the total "active" wall clock time since start. In AWS lambda the process can be frozen, the + time while we are frozen should not be counted in here. In an EC2 or other type of host it is simply the wall + clock time since start. 
+ If the "end" time has been set, we give the active time between start and end otherwise between start and now + """ + end = self.last_pause if self.last_pause is not None else \ + self._end if self._end is not None else current_milli_time(clock=self._clock) + return end - self.start - self._paused_ms + + def add(self, sample): + """ + Merge Sample into the call graph. + """ + self.total_attempted_sample_threads_count += \ + sample.attempted_sample_threads_count + self.total_seen_threads_count += \ + sample.seen_threads_count + self.total_sample_count += 1 + + for stack in sample.stacks: + self._insert_stack(stack) + + def set_overhead_ms(self, duration_timedelta): + """ + The overhead is the total cpu time spent profiling since start. It is measured by a Timer object and only passed + to the profile object before we report it so it is only added here because it is more convenient to convey this + data with the rest of the profile data. + """ + self.overhead_ms = duration_timedelta.total_seconds() * 1000 + + def _insert_stack(self, stack, runnable_count_increase=1): + current_node = self.callgraph + + # navigate to the end of the stack in the graph, adding nodes when necessary + for frame in stack: + current_node = current_node.update_current_node_and_get_child(frame) + + # only increment the end of the stack as we use self time in the graph + current_node.increase_runnable_count(runnable_count_increase) + + def get_memory_usage_bytes(self): + return self.memory_counter.get_memory_usage_bytes() + + def pause(self): + if self.last_pause is not None: + # pause gets called when profile is paused + return + self.last_pause = current_milli_time(clock=self._clock) + self.last_resume = None + + def resume(self): + if self.last_resume is not None: + # resume gets called when profile is running + return + self.last_resume = current_milli_time(clock=self._clock) + self._paused_ms += self.last_resume - self.last_pause + self.last_pause = None + + def is_empty(self): + return self.total_seen_threads_count == 0.0 + + @staticmethod + def _validate_positive_number(value): + if value <= 0: + raise ValueError( + "Value must be bigger than 0, got {}".format(value)) + + def average_thread_weight(self): + """ + The average thread weight can be used to detect if the samples contained + in a given profile were taken from all of the application threads, or + just from a smaller subset, and thus to rescale counts when profiles + from several machines are aggregated. + + This value will be 1.0 if all threads were sampled, and > 1.0 if a + subset was chosen. 
+ """ + if self.total_attempted_sample_threads_count == 0: + return 1.0 + + return self.total_seen_threads_count / float( + self.total_attempted_sample_threads_count) + + def __str__(self): + return "Profile(profiling_group_name=" + self.profiling_group_name \ + + ", start=" + to_iso(self.start) \ + + ', end=' + to_iso(self.end) \ + + ', duration_ms=' + str(self.get_active_millis_since_start()) \ + + ')' diff --git a/codeguru_profiler_agent/model/sample.py b/codeguru_profiler_agent/model/sample.py new file mode 100644 index 0000000..7b1178f --- /dev/null +++ b/codeguru_profiler_agent/model/sample.py @@ -0,0 +1,12 @@ +class Sample: + def __init__(self, stacks, attempted_sample_threads_count=0, seen_threads_count=0): + """ + :param stacks: list of lists; each list is a list of Frame object representing a thread stack in bottom (of thread stack) to top (of thread stack) order + :param start_time: current time (in ms) just before we started taking the sample + :param end_time: current time (in ms) just after we started taking the sample + :param attempted_sample_threads_count: how many threads we tried to sample (can be > than len(stacks) if we could not get/excluded some threads) + :param seen_threads_count: total number of threads observed in the system when we took the sample + """ + self.stacks = stacks + self.attempted_sample_threads_count = attempted_sample_threads_count + self.seen_threads_count = seen_threads_count diff --git a/codeguru_profiler_agent/profiler.py b/codeguru_profiler_agent/profiler.py new file mode 100644 index 0000000..99b54d0 --- /dev/null +++ b/codeguru_profiler_agent/profiler.py @@ -0,0 +1,278 @@ +import logging +import re +import datetime +import uuid +import threading + +from datetime import timedelta +from random import SystemRandom +from types import MappingProxyType as UnmodifiableDict +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata +from codeguru_profiler_agent.profiler_disabler import ProfilerDisabler +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration, AgentConfigurationMerger + +from codeguru_profiler_agent.metrics.timer import Timer +from codeguru_profiler_agent.profiler_runner import ProfilerRunner +from codeguru_profiler_agent.file_reporter.file_reporter import FileReporter +from codeguru_profiler_agent.local_aggregator import LocalAggregator +from codeguru_profiler_agent.sdk_reporter.sdk_reporter import SdkReporter +from codeguru_profiler_agent.codeguru_client_builder import CodeGuruClientBuilder + +INITIAL_MINIMUM_REPORTING_INTERVAL = timedelta(seconds=30) + +DEFAULT_CPU_LIMIT_PERCENTAGE = 10 +DEFAULT_REPORTING_INTERVAL = datetime.timedelta(minutes=5) +DEFAULT_SAMPLING_INTERVAL = datetime.timedelta(seconds=1.0) +DEFAULT_MAX_STACK_DEPTH = 1000 +# TODO: Review the memory limit +DEFAULT_MEMORY_LIMIT_BYTES = 10 * 1024 * 1024 + +# Skip issue reported by Bandit. +# [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory. +# https://bandit.readthedocs.io/en/latest/plugins/b108_hardcoded_tmp_directory.html +# This file can be used by the customer as kill switch for the Profiler. +KILLSWITCH_FILEPATH = "/var/tmp/killProfiler" #nosec + +logger = logging.getLogger(__name__) + +# this lock is used for checking the singleton when we start and stop the profiler +start_profiler_lock = threading.Lock() + + +class Profiler: + """ + The Profiler Python agent provides a lightweight way of monitoring your service performance. 
+    It uses collected samples to provide information about what's on the stack of each thread in the Python
+    interpreter. You MUST provide a profiling_group_name.
+    """
+
+    # this singleton instance is used to make sure only one profiler instance is started at a time.
+    _active_profiler = None
+
+    def __init__(self,
+                 profiling_group_name,
+                 region_name=None,
+                 aws_session=None,
+                 environment_override=dict()):
+        """
+        NOTE: The profiler MUST be instantiated using keyword arguments. We provide no compatibility for the order
+        (or number) of arguments.
+
+        - Configuration
+
+        :param profiling_group_name: name of the profiling group where the profiles will be stored.
+        :param region_name: AWS Region to report to, the given profiling group name must exist in that region. Note
+            that this value overwrites what is used in aws_session. If not provided, boto3 will search
+            configuration for the region. (e.g. "us-west-2")
+        :param aws_session: The boto3.Session that this profiler should be using for communicating with the backend.
+            Check https://boto3.amazonaws.com/v1/documentation/api/latest/guide/session.html for more details.
+
+        - Advanced Configuration Options - We recommend not to touch these
+        :param environment_override: custom dependency container dictionary. Allows custom behavior to be injected,
+            but please note that we do not guarantee compatibility between any different profiler agent versions for
+            this api (default: dict()). Possible keys:
+            - reporting_interval: delay between profile reports in datetime.timedelta (default: 5 minutes)
+            - sampling_interval: delay between each sample in datetime.timedelta (default: 1 second)
+            - reporting_mode: Reporting mode to be used, two modes are supported: "codeguru_service" and "file".
+              "file" mode is only used for testing at the moment. (default: "codeguru_service")
+            - file_prefix: path + file prefix to use for profile reports when in "file" reporting mode
+              (default: './profile-{profiling_group_name}', only used when reporting mode is "file")
+            - cpu_limit_percentage: cpu limit (%) for the profiler (default: 10)
+            - max_threads: the max number of threads getting sampled (default: 100)
+            - killswitch_filepath: file path pointing to the killswitch file (default: "/var/tmp/killProfiler")
+            - host_weight: A scale factor used to rescale the profile collected on this host to make the profile
+              representative of the whole fleet (default: 1)
+            - endpoint_url: url used for submitting profiles (default: None, will target the CodeGuru prod APIs)
+            - excluded_threads: set of thread names to be excluded from sampling (default: set())
+        """
+        self._profiler_runner_instance = None
+        self.environment = {}
+        try:
+            if not profiling_group_name:
+                logger.info(
+                    "Profiler must be passed a non-empty profiling group name, CodeGuru Profiler will not start. "
+                    "Please specify a ``profiling_group_name`` when configuring the ``Profiler`` class."
+                )
+                return
+
+            # This is the profiler instance-wide dependency container aka environment.
+            # It is used to contain shared dependencies and configuration, and can also be used to override the
+            # behavior of the profiler classes without needing to monkey-patch.
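+            # For illustration (hypothetical values, not part of this change), a caller could pass:
+            #     Profiler(profiling_group_name="myProfilingGroup",
+            #              environment_override={"cpu_limit_percentage": 5,
+            #                                    "sampling_interval": datetime.timedelta(seconds=2)})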
+ self.environment = self._set_default_environment(profiling_group_name) + self.environment["profiling_group_name"] = profiling_group_name + self.environment["region_name"] = region_name + self.environment["aws_session"] = aws_session + + default_config = AgentConfiguration( + should_profile=self.environment["should_profile"], + sampling_interval=self.environment["sampling_interval"], + reporting_interval=self.environment["reporting_interval"], + minimum_time_reporting=self.environment["minimum_time_reporting"], + max_stack_depth=self.environment["max_stack_depth"], + cpu_limit_percentage=self.environment["cpu_limit_percentage"]) + user_overrides = AgentConfiguration( + should_profile=environment_override.get('should_profile'), + sampling_interval=environment_override.get('sampling_interval'), + reporting_interval=environment_override.get('reporting_interval'), + minimum_time_reporting=environment_override.get('minimum_time_reporting'), + max_stack_depth=environment_override.get('max_stack_depth'), + cpu_limit_percentage=environment_override.get('cpu_limit_percentage')) + agent_config_merger = AgentConfigurationMerger(default=default_config, user_overrides=user_overrides) + self.environment["agent_config_merger"] = agent_config_merger + + # Removing all keys from the environment that were used for the AgentConfigurationMerger + # to make sure the rest of the code would read it from the AgentConfiguration. + for key in default_config.as_dict().keys(): + del self.environment[key] + for key in user_overrides.as_dict().keys(): + del environment_override[key] + + self.environment = self._setup_final_environment(self.environment, environment_override) + profiler_runner_factory = self.environment.get("profiler_runner_factory") or ProfilerRunner + self._profiler_runner_instance = profiler_runner_factory(environment=self.environment) + except: + logger.info("Caught exception while creating the CodeGuru Profiler Agent instance", exc_info=True) + if environment_override.get("allow_top_level_exceptions") is True: + raise + + @staticmethod + def _set_default_environment(profiling_group_name): + return { + 'timer': Timer(), + 'profiler_thread_name': 'codeguru-profiler-agent-' + str(uuid.uuid4()).replace('-', ''), + 'reporting_mode': 'codeguru_service', + 'file_prefix': 'profile-{}'.format(re.sub(r"\W", "", profiling_group_name)), + 'excluded_threads': set(), + 'should_profile': True, + 'sampling_interval': DEFAULT_SAMPLING_INTERVAL, + 'reporting_interval': DEFAULT_REPORTING_INTERVAL, + 'minimum_time_reporting': INITIAL_MINIMUM_REPORTING_INTERVAL, + 'max_stack_depth': DEFAULT_MAX_STACK_DEPTH, + 'cpu_limit_percentage': DEFAULT_CPU_LIMIT_PERCENTAGE, + 'memory_limit_bytes': DEFAULT_MEMORY_LIMIT_BYTES, + 'host_weight': 1.0, + 'killswitch_filepath': KILLSWITCH_FILEPATH, + 'max_threads': 100 + } + + def _setup_final_environment(self, environment, environment_override): + environment.update(environment_override) + + # set additional parameters if needed (costly default init or depend on other parameters) + if environment.get('initial_sampling_interval') is None: + environment['initial_sampling_interval'] = datetime.timedelta( + seconds=SystemRandom().uniform(0, AgentConfiguration.get().sampling_interval.total_seconds())) + environment['excluded_threads'] = \ + frozenset({environment['profiler_thread_name']}.union(environment['excluded_threads'])) + # TODO delay metadata lookup until we need it + environment['agent_metadata'] = environment.get('agent_metadata') or AgentMetadata() + environment['collector'] = 
environment.get('collector') or self._select_collector(environment)
+        environment["profiler_disabler"] = environment.get('profiler_disabler') or ProfilerDisabler(environment)
+        return UnmodifiableDict(environment)
+
+    @staticmethod
+    def _select_collector(environment):
+        reporting_mode = environment.get('reporting_mode')
+        if reporting_mode == "codeguru_service":
+            environment["codeguru_profiler_builder"] = CodeGuruClientBuilder(environment)
+            return LocalAggregator(
+                reporter=SdkReporter(environment=environment),
+                environment=environment)
+        elif reporting_mode == "file":
+            return LocalAggregator(
+                reporter=FileReporter(environment=environment),
+                environment=environment)
+        else:
+            raise ValueError("Invalid reporting mode for CodeGuru Profiler detected: {}".format(reporting_mode))
+
+    def start(self, block=False):
+        """
+        Start executing the profiler with the settings provided on creation. If the profiler was paused,
+        it will resume profiling.
+        Resuming profiling is done asynchronously by default, so the method can return before the action
+        is effectively applied.
+        Note: only one instance of Profiler can be active at a time; _start() enforces this
+        (similar to what we have for the Java agent).
+
+        :param block: if True, we will not return from this function before the profiling started, default is False.
+        :return: True if the profiler was started successfully; False otherwise.
+        """
+        try:
+            if not self.is_running():
+                return self._start(block)
+            else:
+                logger.debug("Resuming CodeGuru Profiler activity")
+                self._profiler_runner.resume(block)
+                return True
+        except:
+            logger.info("Caught exception while trying to start the CodeGuru Profiler Agent", exc_info=True)
+            return False
+
+    def _start(self, block=False):
+        logger.debug("Attempting to start the CodeGuru profiler")
+        with start_profiler_lock:
+            if Profiler._active_profiler is not None and Profiler._active_profiler != self:
+                logger.info("Starting multiple instances of profiler agents is not allowed. "
+                            "Please validate the configuration of the profiler. "
+                            "If this is intentional, then stop the active profiler agent before starting a new one.")
+                return False
+            logger.info("Starting profiler, " + str(self))
+            if not self._profiler_runner.start():
+                logger.info("CodeGuru Profiler Agent failed to start.")
+                return False
+            Profiler._active_profiler = self
+            return True
+
+    def is_running(self):
+        """
+        Determine if the Profiler is actually running or not.
+
+        :return: True if the profiler is running.
+        """
+        try:
+            return self._profiler_runner.is_running()
+        except:
+            logger.info("Unable to detect if the CodeGuru Profiler is running.", exc_info=True)
+            return False
+
+    def stop(self):
+        """
+        Terminates the profiler and flushes the existing profile to the backend. If this method is called while
+        this instance of the profiler has never been started, it will return False.
+
+        :return: True if the profiler was stopped successfully or was already stopped; False otherwise.
+        """
+        try:
+            with start_profiler_lock:
+                if Profiler._active_profiler == self:
+                    self._profiler_runner.stop()
+                    Profiler._active_profiler = None
+                    return True
+        except:
+            logger.info("Caught exception while trying to stop the CodeGuru Profiler Agent", exc_info=True)
+            return False
+
+    def pause(self, block=False):
+        """
+        Pauses all profiler activity until start() is called again.
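+        For example (hypothetical usage), to keep a latency-critical section out of the profile:
+
+            profiler.pause(block=True)
+            handle_latency_critical_request()
+            profiler.start()
+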
+        By default the pause is done asynchronously, so the method can return before the pause is effectively applied.
+
+        :param block: if True, we will not return from this function before the pause is applied, default is False.
+        """
+        try:
+            logger.debug("Pausing CodeGuru Profiler activity")
+            self._profiler_runner.pause(block)
+            return True
+        except:
+            logger.info("Unable to pause the CodeGuru Profiler.", exc_info=True)
+            return False
+
+    @property
+    def _profiler_runner(self):
+        if self._profiler_runner_instance:
+            return self._profiler_runner_instance
+        else:
+            raise Exception("CodeGuru Profiler was not correctly initialized; see previous error messages for cause")
+
+    def __str__(self):
+        return 'Profiler(environment=' + str({k: self.environment.get(k) for k in
+                                              ['max_threads', 'profiling_group_name', 'region_name',
+                                               'aws_session']}) + ')'
diff --git a/codeguru_profiler_agent/profiler_builder.py b/codeguru_profiler_agent/profiler_builder.py
new file mode 100644
index 0000000..617feec
--- /dev/null
+++ b/codeguru_profiler_agent/profiler_builder.py
@@ -0,0 +1,155 @@
+import os
+import logging
+import datetime
+import boto3
+
+from codeguru_profiler_agent.utils.log_exception import log_exception
+
+logger = logging.getLogger(__name__)
+
+# These variable names will be in external documentation and must be the same as for other agents (e.g. java agent).
+# Do not change them unless you are sure.
+# TODO: Should we move this logic to Profiler instead?
+PG_NAME_ENV = "AWS_CODEGURU_PROFILER_GROUP_NAME"
+PG_ARN_ENV = "AWS_CODEGURU_PROFILER_GROUP_ARN"
+REGION_ENV = "AWS_CODEGURU_PROFILER_TARGET_REGION"
+# Note: Please check function `check_credential_through_environment` before supporting the following env variable
+CREDENTIAL_PATH = "AWS_CODEGURU_PROFILER_CREDENTIAL_PATH"
+ENABLED_ENV = "AWS_CODEGURU_PROFILER_ENABLED"
+
+# non documented parameters
+SAMPLING_INTERVAL = "AWS_CODEGURU_PROFILER_SAMPLING_INTERVAL_MS"
+REPORTING_INTERVAL = "AWS_CODEGURU_PROFILER_REPORTING_INTERVAL_MS"
+
+
+def _read_millis(override, env_name, override_key, env=os.environ):
+    value = env.get(env_name)
+    if value:
+        try:
+            override[override_key] = datetime.timedelta(milliseconds=int(value))
+        except Exception:
+            logger.info("Unable to convert value to a time range for environment variable " + env_name)
+
+
+def _read_override(env=os.environ):
+    override = dict()
+    _read_millis(override, SAMPLING_INTERVAL, "sampling_interval", env)
+    _read_millis(override, REPORTING_INTERVAL, "reporting_interval", env)
+    return override
+
+
+def _read_profiling_group_arn(env=os.environ):
+    """
+    Reads the profiling group ARN from the environment and extracts the name, region and account id from it.
+    The arn is expected to be in this format:
+    arn:aws:codeguru-profiler:us-west-2:003713371902:profilingGroup/profilingGroupName
+    :param env: typically os.environ
+    :return: (name, region, account id)
+    """
+    arn = env.get(PG_ARN_ENV)
+    if not arn:
+        return None, None, None
+    parts = arn.split(':')
+    try:
+        return parts[5].split('/', 1)[1], parts[3], parts[4]
+    except Exception:
+        # print the stack trace for unknown errors as it can help us investigate.
+        log_exception(logger, "Malformed profiling group arn, was expecting"
+                              " arn:aws:codeguru-profiler:<region>:<account_id>:profilingGroup/<name>"
+                              " but got: " + arn)
+        return None, None, None
+
+
+def _get_profiling_group_name(pg_name=None, pg_name_from_arn=None, env=os.environ):
+    return pg_name or _get_profiling_group_name_from_env(pg_name_from_arn, env)
+
+
+def _get_profiling_group_name_from_env(pg_name_from_arn=None, env=os.environ):
+    pg_name_from_env = env.get(PG_NAME_ENV)
+    if pg_name_from_env and pg_name_from_arn and pg_name_from_env != pg_name_from_arn:
+        logger.info("Different profiling group names found from " + PG_NAME_ENV + " and " + PG_ARN_ENV +
+                    ", will use the value from " + PG_ARN_ENV + ": " + pg_name_from_arn)
+    return pg_name_from_arn or pg_name_from_env
+
+
+def _get_region(region_name=None, region_from_arn=None, env=os.environ):
+    return region_name or _get_region_from_env(region_from_arn, env)
+
+
+def _get_region_from_env(region_from_arn=None, env=os.environ):
+    region_from_env = env.get(REGION_ENV)
+    if region_from_env and region_from_arn and region_from_env != region_from_arn:
+        logger.info("Different regions found from " + REGION_ENV + " and " + PG_ARN_ENV +
+                    ", will use the value from " + PG_ARN_ENV + ": " + region_from_arn)
+    return region_from_arn or region_from_env
+
+
+def _is_enabled(env=os.environ):
+    """
+    By default the profiler is enabled; any value in the environment variable other than "true" (case-insensitive)
+    will disable it.
+    """
+    enable_env_value = env.get(ENABLED_ENV, 'true').lower()
+    result = enable_env_value == 'true'
+    if not result:
+        logger.info(ENABLED_ENV + " is set to " + enable_env_value + ", CodeGuru Profiler is disabled")
+    return result
+
+
+def _check_credential_through_environment(env=os.environ):
+    """
+    According to https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#shared-credentials-file,
+    the credential file path gets set globally by setting the env variable AWS_SHARED_CREDENTIALS_FILE. We may
+    reconsider whether we should support this override or not.
+    """
+    if env.get(CREDENTIAL_PATH):
+        logger.info("Credential detected from environment variable " + CREDENTIAL_PATH + ". Unfortunately, we " +
+                    "do not support setting the credential file path through an env variable yet for the Python "
+                    "agent. Please follow the guide on "
+                    "https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#shared-credentials-file "
+                    "and set the credential file path globally through the env variable AWS_SHARED_CREDENTIALS_FILE")
+    return
+
+
+def build_profiler(pg_name=None, region_name=None, credential_profile=None,
+                   env=os.environ, session_factory=boto3.session.Session, profiler_factory=None, override=None):
+    """
+    Creates a Profiler object from the given parameters or from environment variables.
+    :param pg_name: given profiling group name, default is None
+    :param region_name: given region name, default is None
+    :param credential_profile: name of the profile created in the shared credential file, used for submitting profiles
+    :param env: environment variables are used if parameters are not provided, default is os.environ
+    :param session_factory: (For testing) function for creating boto3.session.Session, default is boto3.session.Session
+    :param profiler_factory: (For testing) function for creating the Profiler, default is the Profiler class itself
+    :param override: a dictionary with possible extra parameters to override default values
+    :return: a Profiler object or None; this function does not throw exceptions
+    """
+    if profiler_factory is None:
+        # We import Profiler here rather than at the top of the file to avoid an import loop.
+        from codeguru_profiler_agent.profiler import Profiler
+        profiler_factory = Profiler
+    try:
+        if not _is_enabled(env):
+            logger.info("CodeGuru Profiler is not started as it has been explicitly disabled. Set the environment " +
+                        "variable " + ENABLED_ENV + " to true if you wish to enable the profiler.")
+            return None
+
+        _check_credential_through_environment(env)
+
+        name_from_arn, region_from_arn, _account_id = _read_profiling_group_arn(env)
+        profiling_group_name = _get_profiling_group_name(pg_name, name_from_arn, env)
+        if not profiling_group_name:
+            logger.info("Could not find a profiling group name to start the CodeGuru Profiler agent. " +
+                        "Add a command line argument or an environment variable, e.g. " + PG_ARN_ENV)
+            return None
+        region = _get_region(region_name, region_from_arn, env)
+        session = session_factory(region_name=region, profile_name=credential_profile)
+
+        override_values = _read_override(env)
+        if override:
+            override_values.update(override)
+        return profiler_factory(profiling_group_name=profiling_group_name, region_name=region, aws_session=session,
+                                environment_override=override_values)
+
+    except Exception:
+        # print the stack trace for unknown errors as it can help us investigate.
+ logger.info("Unable to create profiler", exc_info=True) + return None diff --git a/codeguru_profiler_agent/profiler_disabler.py b/codeguru_profiler_agent/profiler_disabler.py new file mode 100644 index 0000000..c912e3d --- /dev/null +++ b/codeguru_profiler_agent/profiler_disabler.py @@ -0,0 +1,86 @@ +import os +import time +import logging +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration +from codeguru_profiler_agent.utils.time import current_milli_time + +logger = logging.getLogger(__name__) +CHECK_KILLSWITCH_FILE_INTERVAL_SECONDS = 60 +MINIMUM_MEASURES_IN_DURATION_METRICS = 20 +MINIMUM_SAMPLES_IN_PROFILE = 5 + + +class ProfilerDisabler: + """ + This class encapsulates all checks that can disable profiler + """ + + def __init__(self, environment, clock=time.time): + self.cpu_usage_check = CpuUsageCheck(environment['timer']) + self.killswitch = KillSwitch(environment['killswitch_filepath'], clock) + self.memory_limit_bytes = environment['memory_limit_bytes'] + + def should_stop_profiling(self, profile=None): + return (self.killswitch.is_killswitch_on() + or self.cpu_usage_check.is_cpu_usage_limit_reached(profile) + or profile is not None and self._is_memory_limit_reached(profile)) + + def _is_memory_limit_reached(self, profile): + return profile.get_memory_usage_bytes() > self.memory_limit_bytes + + +class CpuUsageCheck: + """ + Checks for process duration: we measure the actual wall clock duration of running the profiler, if this duration + becomes too long compared to the sampling interval, we stop profiling. + """ + + def __init__(self, timer): + self.timer = timer + + def is_cpu_usage_limit_reached(self, profile=None): + profiler_metric = self.timer.metrics.get("runProfiler") + if not profiler_metric or profiler_metric.counter < MINIMUM_MEASURES_IN_DURATION_METRICS: + return False + + sampling_interval_seconds = self._get_average_sampling_interval_seconds(profile) + used_time_percentage = 100 * profiler_metric.average() / sampling_interval_seconds + + if used_time_percentage >= AgentConfiguration.get().cpu_limit_percentage: + logger.debug(self.timer.metrics) + logger.info( + "Profiler cpu usage limit reached: {:.2f} % (limit: {:.2f} %), will stop CodeGuru Profiler.".format( + used_time_percentage, AgentConfiguration.get().cpu_limit_percentage)) + return True + else: + return False + + @staticmethod + def _get_average_sampling_interval_seconds(profile): + if profile is None or profile.total_sample_count < MINIMUM_SAMPLES_IN_PROFILE: + return AgentConfiguration.get().sampling_interval.total_seconds() + return (profile.get_active_millis_since_start() / profile.total_sample_count) / 1000 + + +class KillSwitch: + """ + Checks for a kill switch file: if a file with a specific name is present in the file system we stop profiling. 
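+    For example, with the default configuration, creating the file (e.g. `touch /var/tmp/killProfiler`)
+    will stop the profiler within roughly CHECK_KILLSWITCH_FILE_INTERVAL_SECONDS (60) seconds.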
+    """
+
+    def __init__(self, killswitch_filepath, clock=time.time):
+        self.killswitch_filepath = killswitch_filepath
+        self.last_check_for_file_time = None
+        self.last_check_for_file_result = False
+        self.clock = clock
+
+    def is_killswitch_on(self):
+        now = self.clock()
+        should_check_file = self.last_check_for_file_time is None or \
+            now - self.last_check_for_file_time > CHECK_KILLSWITCH_FILE_INTERVAL_SECONDS
+        if should_check_file:
+            self.last_check_for_file_result = os.path.isfile(self.killswitch_filepath)
+            self.last_check_for_file_time = now
+            if self.last_check_for_file_result:
+                logger.info(
+                    "Found kill-switch file at {}, will stop CodeGuru Profiler.".format(self.killswitch_filepath))
+        return self.last_check_for_file_result
diff --git a/codeguru_profiler_agent/profiler_runner.py b/codeguru_profiler_agent/profiler_runner.py
new file mode 100644
index 0000000..4c3e845
--- /dev/null
+++ b/codeguru_profiler_agent/profiler_runner.py
@@ -0,0 +1,126 @@
+import logging
+
+from codeguru_profiler_agent.metrics.with_timer import with_timer
+from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration
+from codeguru_profiler_agent.sampler import Sampler
+from codeguru_profiler_agent.utils.scheduler import Scheduler
+
+logger = logging.getLogger(__name__)
+
+
+class ProfilerRunner:
+    """
+    ProfilerRunner instantiates and orchestrates all components required for running the profiler.
+    It checks the kill switch and the cpu limit every time before it samples.
+    This class should only be accessed by Profiler for controlling and monitoring the profiler.
+
+    NOTE: If CPU usage exceeds the CPU limit, the profiler is terminated immediately without any attempt to flush
+    the existing data.
+    NOTE: The memory limit check is implemented in LocalAggregator.
+    """
+
+    def __init__(self, environment=dict()):
+        """
+        :param environment: dependency container dictionary for the current profiler
+        :param sampling_interval: (required inside environment) delay between samples in datetime.timedelta
+        :param killswitch_filepath: (required inside environment) filepath pointing to the killswitch file. This path
+            gets checked every time the profiler samples; the profiler is immediately stopped if this file exists.
+        :param collector: (required inside environment) collector object to handle sample processing
+        :param initial_sampling_interval: (required inside environment) initial delay before the sampler starts
+            sampling
+        :param profiler_thread_name: (required inside environment) thread name used for running the
+            report_orchestration_scheduler
+        """
+        self.timer = environment.get("timer")
+        self.sampler = environment.get("sampler") or Sampler(environment=environment)
+
+        self.scheduler = Scheduler(
+            command=self._profiling_command,
+            delay_provider=lambda: AgentConfiguration.get().sampling_interval,
+            initial_delay=environment["initial_sampling_interval"],
+            thread_name=environment["profiler_thread_name"])
+        self.collector = environment["collector"]
+        self.profiler_disabler = environment["profiler_disabler"]
+        self.is_profiling_in_progress = False
+        self._first_execution = True
+
+    def start(self):
+        """
+        Start running the profiler.
+        Note: the profiler will not start if the killswitch file exists.
+
+        :return: True if the profiler was started successfully; False otherwise.
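+
+        Note that the kill-switch and CPU-limit checks are re-evaluated before every sample
+        (see _run_profiler below), so the profiler can still stop itself after a successful start.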
+ """ + if self.profiler_disabler.should_stop_profiling(): + logger.info("Profiler will not start.") + return False + self.scheduler.start() + return True + + def _refresh_configuration(self): + self.collector.refresh_configuration() + self.is_profiling_in_progress = AgentConfiguration.get().should_profile + if self.is_profiling_in_progress: + self.scheduler.update_delay_provider(lambda: AgentConfiguration.get().sampling_interval) + else: + # if we should not profile we can simply wait for the reporting interval and call again at that time. + self.scheduler.update_delay_provider(lambda: AgentConfiguration.get().reporting_interval) + + def _profiling_command(self): + try: + if self._first_execution: + self.collector.setup() + self._first_execution = False + return self._run_profiler() + except: + logger.info("An unexpected issue caused the profiling command to terminate.", exc_info=True) + return False + + @with_timer("runProfiler") + def _run_profiler(self): + if self.profiler_disabler.should_stop_profiling(self.collector.profile): + return False + + if not self.is_profiling_in_progress: + self._refresh_configuration() + + # after the refresh we may be working on a profile + if self.is_profiling_in_progress: + sample = self.sampler.sample() + self.collector.add(sample) + if self.collector.flush(): + self.is_profiling_in_progress = False + return True + + def is_running(self): + return self.scheduler.is_running() + + def is_paused(self): + return self.scheduler.is_paused() + + def stop(self): + """ + Terminate profiler gracefully. + It terminates the profiling thread and flushes existing profile to the backend. + """ + self.scheduler.stop() + self.collector.flush(force=True) + self.is_profiling_in_progress = False + + def resume(self, block=False): + """ + Will signal the scheduler that profiling should resume. + + :param block: if True, we will not return from this function before the change is applied, default is False. + """ + self.collector.profile.resume() + self.scheduler.resume(block) + + def pause(self, block=False): + """ + Will signal the scheduler that profiling should pause. + + :param block: if True, we will not return from this function before the change is applied, default is False. + """ + self.scheduler.pause(block) + self.collector.profile.pause() diff --git a/codeguru_profiler_agent/reporter/__init__.py b/codeguru_profiler_agent/reporter/__init__.py new file mode 100644 index 0000000..b6e690f --- /dev/null +++ b/codeguru_profiler_agent/reporter/__init__.py @@ -0,0 +1 @@ +from . import * diff --git a/codeguru_profiler_agent/reporter/agent_configuration.py b/codeguru_profiler_agent/reporter/agent_configuration.py new file mode 100644 index 0000000..0b844b0 --- /dev/null +++ b/codeguru_profiler_agent/reporter/agent_configuration.py @@ -0,0 +1,160 @@ +import logging + +from datetime import timedelta + +# The keys for the Agent Configuration's parameters from the API configuration. +# https://docs.aws.amazon.com/codeguru/latest/profiler-api/API_AgentConfiguration.html +# ReportingIntervalInMilliseconds is intentionally left out as it doesn't make sense. 
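+# For illustration only, a parsed configure_agent response could look like this (values are made up):
+#     {
+#         "shouldProfile": True,
+#         "periodInSeconds": 300,
+#         "agentParameters": {
+#             "SamplingIntervalInMilliseconds": "1000",
+#             "MinimumTimeForReportingInMilliseconds": "60000",
+#             "MaxStackDepth": "1000",
+#             "MemoryUsageLimitPercent": "10"
+#         }
+#     }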
+ +MEMORY_USAGE_LIMIT_PERCENT_KEY = "MemoryUsageLimitPercent" +MINIMUM_TIME_FOR_REPORTING_MILLISECONDS_KEY = "MinimumTimeForReportingInMilliseconds" +SAMPLING_INTERVAL_MILLISECONDS_KEY = "SamplingIntervalInMilliseconds" +MAX_STACK_DEPTH_KEY = "MaxStackDepth" +PERIOD_IN_SECONDS_KEY = "periodInSeconds" +SHOULD_PROFILE_KEY = "shouldProfile" + +_singleton = None + +logger = logging.getLogger(__name__) + + +class AgentConfiguration: + """ + Singleton class that holds the configuration for the agent that can be updated based on a json configuration. + """ + + def __init__(self, should_profile=None, sampling_interval=None, reporting_interval=None, + minimum_time_reporting=None, max_stack_depth=None, cpu_limit_percentage=None): + self.should_profile = should_profile + self.sampling_interval = sampling_interval + self.reporting_interval = reporting_interval + self.minimum_time_reporting = minimum_time_reporting + self.max_stack_depth = max_stack_depth + self.cpu_limit_percentage = cpu_limit_percentage + if self._is_reporting_interval_smaller_than_minimum_allowed(): + raise ValueError( + "Configuration issue: reporting_interval cannot be smaller than {} (got {})".format( + minimum_time_reporting, str(self.reporting_interval))) + + def as_dict(self): + return {key: value for key, value in self.__dict__.items() if value is not None} + + @classmethod + def get(cls): + """ + Returns the singleton instance of the AgentConfiguration(). + :return: the one instance of the agent configuration + """ + if _singleton is None: + raise ValueError("Instance for singleton is None, use AgentConfigurationMerger for initialization.") + return _singleton + + @classmethod + def set(cls, current): + """ + Sets the singleton instance of the AgentConfiguration() with the one set as parameter. 
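+        :param current: the AgentConfiguration instance to install; must not be None, a ValueError is raised otherwise.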
+ """ + if current is None: + raise ValueError("You can not set a None instance for the singleton instance.") + global _singleton + _singleton = current + logger.info("New agent configuration: " + str(_singleton.as_dict())) + + @classmethod + def _get_new_config(cls, configure_agent_response=dict()): + agent_parameters_response = configure_agent_response.get("agentParameters", {}) + + current = AgentConfiguration.get() + return AgentConfiguration( + should_profile=configure_agent_response.get(SHOULD_PROFILE_KEY, current.should_profile), + reporting_interval=cls._get_interval_from_seconds(key=PERIOD_IN_SECONDS_KEY, + response=configure_agent_response, + default_value=current.reporting_interval), + sampling_interval=cls._get_interval_from_milliseconds(key=SAMPLING_INTERVAL_MILLISECONDS_KEY, + response=agent_parameters_response, + default_value=current.sampling_interval), + minimum_time_reporting=cls._get_interval_from_milliseconds(key=MINIMUM_TIME_FOR_REPORTING_MILLISECONDS_KEY, + response=agent_parameters_response, + default_value=current.minimum_time_reporting), + max_stack_depth=cls._get_int_value_from(key=MAX_STACK_DEPTH_KEY, + response=agent_parameters_response, + default_value=current.max_stack_depth), + cpu_limit_percentage=cls._get_int_value_from(key=MEMORY_USAGE_LIMIT_PERCENT_KEY, + response=agent_parameters_response, + default_value=current.cpu_limit_percentage) + ) + + @classmethod + def _get_interval_from_milliseconds(cls, key, response, default_value): + return cls._get_interval_from(key, response, default_value, in_milliseconds=True) + + @classmethod + def _get_interval_from_seconds(cls, key, response, default_value): + return cls._get_interval_from(key, response, default_value, in_seconds=True) + + @classmethod + def _get_interval_from(cls, key, response, default_value, in_seconds=False, in_milliseconds=False): + if not in_seconds ^ in_milliseconds: + raise ValueError("Only one of the parameters 'in_seconds' and 'in_milliseconds' must be true.") + int_value = cls._get_int_value_from(key, response) + if int_value is None: + logger.debug("Setting back to previous value: " + str(default_value)) + return default_value + if in_seconds: + return timedelta(seconds=int_value) + else: + return timedelta(milliseconds=int_value) + + @classmethod + def _get_int_value_from(cls, key, response, default_value=None): + if key not in response: + logger.debug("The response '{}' does not contain key '{}'.".format(response, key)) + return default_value + + value = response.get(key) + try: + return int(value) + except Exception: + logger.info("The response contains invalid integer value '{}' for key '{}'.".format(value, key), + exc_info=False) + return default_value + + def is_under_min_reporting_time(self, time_delay_since_last_report): + return time_delay_since_last_report < self.minimum_time_reporting.total_seconds() * 1000 + + def is_over_reporting_interval(self, time_delay_since_last_report): + return time_delay_since_last_report > self.reporting_interval.total_seconds() * 1000 + + def _is_reporting_interval_smaller_than_minimum_allowed(self): + if self.reporting_interval is None or self.minimum_time_reporting is None: + return False + return self.reporting_interval < self.minimum_time_reporting + + +class AgentConfigurationMerger: + + def __init__(self, default=AgentConfiguration(), user_overrides=AgentConfiguration()): + self.default = default.as_dict() + self.user_overrides = user_overrides.as_dict() + self._merge_and_set() + + def merge_with(self, configure_agent_response): + """ + This 
refreshes only the configuration that was NOT overridden by the customer and sets the singleton instance. + :param configure_agent_response: the dict from the backend response + """ + orchestration_config = AgentConfiguration._get_new_config(configure_agent_response) + self._merge_and_set(orchestration_overrides=orchestration_config.as_dict()) + + def _merge_and_set(self, orchestration_overrides=dict()): + merged = self.default + merged.update(orchestration_overrides) + merged.update(self.user_overrides) + AgentConfiguration.set(AgentConfiguration(**merged)) + + def disable_profiling(self): + """ + Only sets should_profile to False, other values in the configuration are unchanged. + This is used for error handling when we call orchestrator. + """ + self._merge_and_set(orchestration_overrides=AgentConfiguration(should_profile=False).as_dict()) diff --git a/codeguru_profiler_agent/reporter/reporter.py b/codeguru_profiler_agent/reporter/reporter.py new file mode 100644 index 0000000..d773007 --- /dev/null +++ b/codeguru_profiler_agent/reporter/reporter.py @@ -0,0 +1,33 @@ +from abc import ABCMeta, abstractmethod + + +class Reporter(metaclass=ABCMeta): # pragma: no cover + """ + A reporter to be used by the aggregator. + """ + + @abstractmethod + def setup(self): + """ + Setup expensive resources. + """ + pass + + @abstractmethod + def refresh_configuration(self): + """ + Configure agent by calling the profiler backend service. + + :return: the updated agent configuration + """ + pass + + @abstractmethod + def report(self, profile): + """ + Report profile. + + :param profile: profile to be reported + :return: True if profile gets reported successfully; False otherwise. + """ + pass diff --git a/codeguru_profiler_agent/sampler.py b/codeguru_profiler_agent/sampler.py new file mode 100644 index 0000000..665046b --- /dev/null +++ b/codeguru_profiler_agent/sampler.py @@ -0,0 +1,65 @@ +import logging +import random +import sys + +import codeguru_profiler_agent.sampling_utils +from codeguru_profiler_agent.metrics.with_timer import with_timer +from codeguru_profiler_agent.model.sample import Sample +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration + +logger = logging.getLogger(__name__) + + +class Sampler: + """ + Returns a Sample containing the stack frames for the currently running threads. + + A Sample includes the thread doing the sample, unless that thread is included in the `excluded_threads` set. + (Within the context of a running Profiler, the Profiler top-level class is currently responsible for adding the + Profiler background thread to the `excluded_threads` so that it does not appear in the call graph.) + """ + + def __init__(self, environment=dict()): + """ + :param environment: dependency container dictionary for the current profiler + :param max_threads: (inside environment) the max number of threads getting sampled + :param excluded_threads: (inside environment) set of thread names to be excluded from sampling + """ + self._max_threads = environment.get("max_threads") or 100 + self._excluded_threads = environment.get("excluded_threads") or set() + self._get_stacks = \ + environment.get("get_stacks") or codeguru_profiler_agent.sampling_utils.get_stacks + self._thread_lister = environment.get("thread_lister") or sys + self.timer = environment.get("timer") + + @with_timer("dumpAllStackTraces") + def sample(self): + """ + Samples stack traces of running threads (up to max_threads, and excluding excluded_threads) running in the + current Python instance. 
Any exception encountered during the sampling process will be propagated.
+        """
+        all_threads = self._get_all_threads()
+        all_threads_count = len(all_threads)
+        threads_to_sample = self._threads_to_sample_from(all_threads)
+        threads_to_sample_count = len(threads_to_sample)
+
+        stacks = self._get_stacks(
+            threads_to_sample=threads_to_sample,
+            excluded_threads=self._excluded_threads,
+            max_depth=AgentConfiguration.get().max_stack_depth)
+
+        # Memory usage optimization
+        del all_threads
+        del threads_to_sample
+
+        return Sample(stacks=stacks, attempted_sample_threads_count=threads_to_sample_count,
+                      seen_threads_count=all_threads_count)
+
+    def _get_all_threads(self):
+        return self._thread_lister._current_frames().items()
+
+    def _threads_to_sample_from(self, all_threads):
+        if len(all_threads) > self._max_threads:
+            return random.sample(all_threads, self._max_threads)
+        else:
+            return all_threads
diff --git a/codeguru_profiler_agent/sampling_utils.py b/codeguru_profiler_agent/sampling_utils.py
new file mode 100644
index 0000000..360c4c1
--- /dev/null
+++ b/codeguru_profiler_agent/sampling_utils.py
@@ -0,0 +1,89 @@
+"""
+This module handles all interactions with the Python sys and traceback libraries for sampling.
+"""
+import linecache
+import threading
+import traceback
+
+from codeguru_profiler_agent.model.frame import Frame
+
+TRUNCATED_FRAME = Frame(name="<truncated>")
+
+TIME_SLEEP_FRAME = Frame(name="<time.sleep>")
+
+
+def get_stacks(threads_to_sample, excluded_threads, max_depth):
+    """
+    Attempts to extract the call stacks for the threads listed in threads_to_sample.
+
+    :param threads_to_sample: list of threads to be sampled, expected in the same format as sys._current_frames().items()
+    :param excluded_threads: set of thread names to be excluded from sampling
+    :param max_depth: the maximum number of frames a stack can have
+    :returns: a list of call stacks (one list of Frame objects per chosen thread); any thread stack deeper than
+        max_depth will be truncated, and TRUNCATED_FRAME will be added as a replacement of the TOPMOST frame of
+        the stack
+    """
+    stacks = []
+    if max_depth < 0:
+        max_depth = 0
+    for thread_id, end_frame in threads_to_sample:
+        if _is_excluded(thread_id, excluded_threads):
+            continue
+
+        stacks.append(_extract_frames(end_frame, max_depth))
+
+    return stacks
+
+
+def _is_zombie(thread):
+    return thread is None
+
+
+def _is_excluded(thread_id, excluded_threads):
+    thread = threading._active.get(thread_id)
+    return _is_zombie(thread) or thread.name in excluded_threads
+
+
+def _extract_class(frame_locals):
+    """
+    See https://stackoverflow.com/questions/2203424/python-how-to-retrieve-class-information-from-a-frame-object/2544639#2544639
+    for the details behind the implementation. Extracting the class from a method relies on the fact that
+    "self" is passed as an argument to the function and points to the class that owns this function.
+    """
+    try:
+        return frame_locals['self'].__class__.__name__
+    except Exception:
+        # Failing to get the class name should not block the whole sample
+        return None
+
+
+def _extract_stack(stack, max_depth):
+    """Create a list of Frame objects from a list of (frame, line_no) tuples, as yielded by traceback.walk_stack.
+
+    :param stack: a list of (frame object, line number) tuples.
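+    :param max_depth: the maximum number of frames expected; when the stack is shorter than this, we check whether
+        a synthetic time.sleep frame should be appended.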
+ """ + result = [] + for raw_frame, line_no in stack: + co = raw_frame.f_code + result.append( + Frame(name=co.co_name, class_name=_extract_class(raw_frame.f_locals), line_no=line_no, + file_path=co.co_filename) + ) + if len(stack) < max_depth: + last_frame, last_frame_line_no = stack[-1] + _maybe_append_time_sleep(result, last_frame, last_frame_line_no) + return result + + +def _maybe_append_time_sleep(result, frame, line_no): + line = linecache.getline(frame.f_code.co_filename, line_no).strip() + if "sleep(" in line: + result.append(TIME_SLEEP_FRAME) + + +def _extract_frames(end_frame, max_depth): + stack = list(traceback.walk_stack(end_frame))[::-1][0:max_depth] + stack_entries = _extract_stack(stack, max_depth) + + if len(stack_entries) == max_depth: + stack_entries[-1] = TRUNCATED_FRAME + + return stack_entries diff --git a/codeguru_profiler_agent/sdk_reporter/__init__.py b/codeguru_profiler_agent/sdk_reporter/__init__.py new file mode 100644 index 0000000..b6e690f --- /dev/null +++ b/codeguru_profiler_agent/sdk_reporter/__init__.py @@ -0,0 +1 @@ +from . import * diff --git a/codeguru_profiler_agent/sdk_reporter/profile_encoder.py b/codeguru_profiler_agent/sdk_reporter/profile_encoder.py new file mode 100644 index 0000000..315f3bf --- /dev/null +++ b/codeguru_profiler_agent/sdk_reporter/profile_encoder.py @@ -0,0 +1,195 @@ +import json +import gzip +import io +import platform +import sys +from functools import lru_cache +import os +from pathlib import Path + +GZIP_BALANCED_COMPRESSION_LEVEL = 6 +DEFAULT_FRAME_COMPONENT_DELIMITER = ":" + + +def _get_module_path(file_path, sys_paths): + """ + We tried to remove the python library root path in order to give a reasonable expression of the module path. + For example, /tmp/bin/python/site-packages/great_app/simple_expansions/simple_interface.py + will get turned into great_app.simple_expansions.simple_interface given that the syspath contains + /tmp/bin/python/site-packages + """ + module_path = file_path + + if platform.system() == "Windows": + # In Windows, separator can either be / or \ from experimental result + file_path = file_path.replace("/", os.sep) + + for root in sys_paths: + if root in file_path: + module_path = file_path.replace(root, "") + break + + # remove suffix + module_path = str(Path(module_path).with_suffix("")) + # remove drive (applicable for WINDOWS customers) + module_path = os.path.splitdrive(module_path)[1] + + module_path = module_path.replace(os.sep, ".") + + if module_path.startswith("."): + module_path = module_path[1:] + + return module_path + + +class ProfileEncoder: + """ + Encodes a given Profile into the JSON version of the ion-based profile format + + For efficiency, this encoder: + * compresses the output as gzip, thus reducing the size of the final result + * writes the output incrementally to a stream, thus reduces the footprint of the conversion + * implements its own json writer: + * enables the incremental stream-based output + * decouples our in-memory implementation from the current ion format + """ + + def __init__(self, environment=dict(), gzip=True): + self._gzip = gzip + self._agent_metadata = environment["agent_metadata"] + self._module_path_extractor = self.ModulePathExtractor(environment.get("sys_path") or sys.path) + + def encode(self, profile, output_stream): + if self._gzip: + output_stream = self._gzip_stream_from(output_stream) + + output_stream.write( + self.InnerProfileEncoder(profile, self._agent_metadata, self._module_path_extractor) + .encode_content().encode("utf-8") + ) + + if 
self._gzip: + output_stream.close() + + def _gzip_stream_from(self, stream): + return gzip.GzipFile( + fileobj=stream, + mode="wb", + compresslevel=GZIP_BALANCED_COMPRESSION_LEVEL) + + class ModulePathExtractor: + def __init__(self, sys_path=[], extractor_fun=_get_module_path): + self._sys_path = sorted(sys_path, key=len, reverse=True) + self._extractor_fun = extractor_fun + + @lru_cache(maxsize=128) + def get_module_path(self, file_path): + if file_path is None: + return None + return self._extractor_fun(file_path, self._sys_path) + + class InnerProfileEncoder: + def __init__(self, profile, agent_metadata, module_path_extractor): + self._profile = profile + self._agent_metadata = agent_metadata + self._module_path_extractor = module_path_extractor + + def encode_content(self): + profile_in_map = { + "start": int(self._profile.start), + "end": int(self._profile.end), + "agentMetadata": self._encode_agent_metadata(), + "callgraph": self._encode_call_graph(self._profile.callgraph) + } + + return json.dumps(profile_in_map) + + def _encode_agent_metadata(self): + profile_duration_seconds = self._profile.get_active_millis_since_start() / 1000.0 + sample_weight = 1.0 if (profile_duration_seconds == 0) else self._profile.total_sample_count / profile_duration_seconds + average_num_threads = 0.0 if (self._profile.total_sample_count == 0) else (self._profile.total_seen_threads_count / self._profile.total_sample_count) + + return self._agent_metadata.serialize_to_json( + sample_weight=sample_weight, + duration_ms=self._profile.get_active_millis_since_start(), + cpu_time_seconds=self._profile.cpu_time_seconds, + average_num_threads=average_num_threads, + memory_usage_mb=self._convert_to_mb(self._profile.get_memory_usage_bytes()), + overhead_ms=self._profile.overhead_ms + ) + + def _convert_to_mb(self, bytes_to_convert): + return bytes_to_convert / (1024 * 1024) + + def _encode_call_graph(self, call_graph): + return self._encode_node_recursive(call_graph) + + def _convert_line_range(self, node): + if node.start_line is None: + return None + + if node.start_line == node.end_line: + return { + "line": [node.start_line] + } + else: + return { + "line": [node.start_line, node.end_line] + } + + def _convert_file_path(self, node): + if node.file_path is None: + return None + if platform.system() == "Windows": + # In Windows, separator can either be / or \ from experimental result + file_path = node.file_path.replace("/", os.sep) + else: + file_path = node.file_path + return { + "file": file_path + } + + def _convert_runnable_count(self, node): + if node.runnable_count > 0: + return { + "counts": { + "WALL_TIME": node.runnable_count + } + } + return None + + def _encode_node_recursive(self, node): + node_map = {} + runnable_count_map = self._convert_runnable_count(node) + if runnable_count_map: + node_map.update(runnable_count_map) + file_path_map = self._convert_file_path(node) + if file_path_map: + node_map.update(file_path_map) + line_range_map = self._convert_line_range(node) + if line_range_map: + node_map.update(line_range_map) + + if node.children: + node_map["children"] = self._encode_children_nodes_recursive(node.children) + + return node_map + + def _encode_children_nodes_recursive(self, children_nodes): + node_map = {} + for child_node in children_nodes: + frame = DEFAULT_FRAME_COMPONENT_DELIMITER.join( + list(filter(None, [self._module_path_extractor.get_module_path(child_node.file_path), + child_node.class_name, child_node.frame_name]))) + child_node_map = self._encode_node_recursive(child_node) 
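+                # The resulting key looks like "module.path:ClassName:method_name", e.g. (illustrative)
+                # "great_app.simple_expansions.simple_interface:SimpleInterface:handle"; empty components are skipped.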
+                node_map[frame] = child_node_map
+
+            return node_map
+
+    # Useful for debugging, converts a profile into a prettified JSON output
+    @staticmethod
+    def debug_pretty_encode(profile, environment=dict(), sort_keys=True):
+        stream = io.BytesIO()
+        ProfileEncoder(gzip=False, environment=environment).encode(profile, stream)
+        parsed_json = json.loads(stream.getvalue().decode("utf-8"))
+        return json.dumps(parsed_json, indent=2, sort_keys=sort_keys)
diff --git a/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py b/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py
new file mode 100644
index 0000000..d573ead
--- /dev/null
+++ b/codeguru_profiler_agent/sdk_reporter/sdk_reporter.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+
+import logging
+import io
+
+from botocore.exceptions import ClientError
+from codeguru_profiler_agent.utils.log_exception import log_exception
+from codeguru_profiler_agent.reporter.reporter import Reporter
+from codeguru_profiler_agent.metrics.with_timer import with_timer
+from codeguru_profiler_agent.sdk_reporter.profile_encoder import ProfileEncoder
+
+logger = logging.getLogger(__name__)
+
+
+class SdkReporter(Reporter):
+    """
+    Handles communication with the CodeGuru Profiler Service backend.
+    Encodes profiles using the ProfileEncoder and reports them using the CodeGuru Profiler SDK client.
+    """
+
+    def __init__(self, environment):
+        """
+        :param environment: dependency container dictionary for the current profiler.
+        :param profiling_group_name: (required inside environment) name of the profiling group.
+        :param codeguru_profiler_client: (required inside environment) sdk client for CodeGuru Profiler calls.
+        """
+        self.profiling_group_name = environment["profiling_group_name"]
+        self.codeguru_client_builder = environment["codeguru_profiler_builder"]
+        # TODO decide if we need to compress with gzip or not.
+        self.profile_encoder = \
+            environment.get("profile_encoder") or ProfileEncoder(environment=environment, gzip=False)
+        self.timer = environment.get("timer")
+        self.metadata = environment["agent_metadata"]
+        self.agent_config_merger = environment["agent_config_merger"]
+
+    def _encode_profile(self, profile):
+        output_profile_stream = io.BytesIO()
+        self.profile_encoder.encode(
+            profile=profile, output_stream=output_profile_stream)
+        output_profile_stream.seek(0)
+        return output_profile_stream
+
+    @with_timer("setupSdkReporter", measurement="wall-clock-time")
+    def setup(self):
+        """
+        Initialize expensive resources.
+        """
+        self.codeguru_client_builder.codeguru_client
+
+    @with_timer("refreshConfiguration", measurement="wall-clock-time")
+    def refresh_configuration(self):
+        """
+        Refresh the agent configuration by calling the profiler backend service.
+        """
+        try:
+            fleet_instance_id = self.metadata.fleet_info.get_fleet_instance_id()
+            metadata = self.metadata.fleet_info.get_metadata_for_configure_agent_call()
+            configuration = self.codeguru_client_builder.codeguru_client.configure_agent(
+                fleetInstanceId=fleet_instance_id,
+                metadata=metadata if metadata is not None else {},
+                profilingGroupName=self.profiling_group_name
+            ).get('configuration')
+            logger.debug("Got response from backend for configure_agent operation: " + str(configuration))
+            self.agent_config_merger.merge_with(configure_agent_response=configuration)
+        except ClientError as error:
+            # If we get a validation error or the profiling group does not exist, do not profile. We do not stop the
+            # whole process because the customer may fix this on their side by creating/changing the profiling group.
+ # We handle service exceptions like this in boto3 + # see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/error-handling.html + if error.response['Error']['Code'] in ['ResourceNotFoundException', 'ValidationException']: + self.agent_config_merger.disable_profiling() + self._log_request_failed(operation="configure_agent", exception=error) + except Exception as e: + self._log_request_failed(operation="configure_agent", exception=e) + + @with_timer("report", measurement="wall-clock-time") + def report(self, profile): + """ + Report profile to the profiler backend service. + + :param profile: Profile to be encoded and reported to the profiler backend service. + :return: True if profile gets reported successfully; False otherwise. + """ + try: + profile_stream = self._encode_profile(profile) + self.codeguru_client_builder.codeguru_client.post_agent_profile( + agentProfile=profile_stream, + contentType='application/json', + profilingGroupName=self.profiling_group_name + ) + logger.info("Reported profile successfully") + return True + except Exception as e: + self._log_request_failed(operation="post_agent_profile", exception=e) + return False + + @staticmethod + def _log_request_failed(operation, exception): + log_exception(logger, "Failed to call the CodeGuru Profiler service for the {} operation: {}" + .format(operation, str(exception))) diff --git a/codeguru_profiler_agent/utils/__init__.py b/codeguru_profiler_agent/utils/__init__.py new file mode 100644 index 0000000..b6e690f --- /dev/null +++ b/codeguru_profiler_agent/utils/__init__.py @@ -0,0 +1 @@ +from . import * diff --git a/codeguru_profiler_agent/utils/execution_state.py b/codeguru_profiler_agent/utils/execution_state.py new file mode 100644 index 0000000..8bd8404 --- /dev/null +++ b/codeguru_profiler_agent/utils/execution_state.py @@ -0,0 +1,143 @@ +from __future__ import absolute_import + +import time +import datetime + +from queue import Queue, Empty + + +class ExecutionState: + RUNNING = "RUNNING" + PAUSED = "PAUSED" + STOPPED = "STOPPED" + + def __init__(self, + delay_provider=None, + initial_delay=datetime.timedelta(), + state_changes_queue=None, + clock=time.time): + """ + This class keeps track of the state of execution for the scheduler, + and it handles the delay between executions. + The user thread should be the only one changing the state with signal_* calls. + The profiler sampling thread should only read the state and call wait_for_next_tick_or_stop() which + waits for the appropriate delay and handles pause and resume signals. + :param delay_provider: function that provides the delay as timedelta, default returns always 1s + :param initial_delay: time we have to wait before the first execution, default is empty timedelta. + :param state_changes_queue: the queue in which signals for changes are going. used only for unit tests. + :param clock: a function to return current time, used only for unit tests. 
default is time.time + """ + self._state_changes = state_changes_queue or Queue() + self._current_state = ExecutionState.RUNNING + self._clock = clock + self.initial_delay = initial_delay + self._already_waited_seconds = 0.0 + self._time_when_waiting_started = None + self.delay_provider = delay_provider if delay_provider else lambda: datetime.timedelta(seconds=1) + + def signal_resume(self, block=False): + self._state_changes.put({ + "state": ExecutionState.RUNNING, + "current_time": self._clock() + }) + if block: + self._state_changes.join() + + def signal_pause(self, block=False): + self._state_changes.put({ + "state": ExecutionState.PAUSED, + "current_time": self._clock() + }) + if block: + self._state_changes.join() + + def signal_stop(self, block=False): + self._state_changes.put({"state": ExecutionState.STOPPED}) + if block: + self._state_changes.join() + + def set_stopped(self): + """ + Contrary to signal_stop, this should not be used by the user thread. + This is used by sampling thread to make sure the state is set to STOPPED + when it is stopping for other reasons than a user's signal_stop() call. + """ + self._current_state = ExecutionState.STOPPED + + def is_paused(self): + return self._current_state is ExecutionState.PAUSED + + def is_stopped(self): + return self._current_state is ExecutionState.STOPPED + + def next_delay_seconds(self): + """ + returns the delay in seconds that we are going to wait next + """ + if self.initial_delay is None: + return self.delay_provider().total_seconds() + else: + return self.initial_delay.total_seconds() + + def _wait_for_execution_time(self): + """ + wait for next execution time or for any change of status. + The first time this is called, if initial_delay is an empty timedelta + we return immediately without even checking the queue to avoid the small delay. + + :return: The new status change or None if it is time to execute. + """ + next_delay_seconds = self.next_delay_seconds() + if next_delay_seconds == 0: + return None + try: + wait_time = max(0.0, next_delay_seconds - self._already_waited_seconds) + return self._state_changes.get(block=True, timeout=wait_time) + except Empty: + return None + + def _wait_for_resume(self): + return self._state_changes.get(block=True) + + def _apply_new_state(self, new_state): + """ + Update the current state variable and save the waited time if we paused. + + :return True if there was a change in the state, False otherwise. + """ + if not new_state: + return False + self._current_state = new_state["state"] + if self._current_state is ExecutionState.PAUSED: + # if we paused, we save how much time we have already waited so we know how much longer we need to wait. + # use the time provided by signal_* call as reference in case we were late to catch up with pause signal. + now = new_state["current_time"] + self._already_waited_seconds += max(0, now - self._time_when_waiting_started) + return True + + def wait_for_next_tick_or_stop(self): + """ + Wait until it is time to execute or the process is stopped. + Status should either be RUNNING or STOPPED at the end. + If initial_delay was set to 0 this will return True even if we + have immediately paused or stopped. + + :return: True if the status is RUNNING at the end. False otherwise. 
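+
+        Callers are expected to loop on this method; the scheduler, for example, runs (simplified sketch,
+        where run_command_once stands for the scheduled command):
+
+            while state.wait_for_next_tick_or_stop():
+                run_command_once()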
+ """ + is_time_to_execute = False + while self._current_state is not ExecutionState.STOPPED and not is_time_to_execute: + self._time_when_waiting_started = self._clock() + if self._current_state is ExecutionState.PAUSED: + new_state = self._wait_for_resume() + else: + new_state = self._wait_for_execution_time() + # update state, if there is no change in the state after waiting then it is time to execute. + is_time_to_execute = not self._apply_new_state(new_state) + if new_state: + self._state_changes.task_done() + + # if we exit the while loop that means either we are stopped or it is time to execute + # remove the initial_delay as we do not need it anymore, also reset _already_waited_seconds + self.initial_delay = None + self._already_waited_seconds = 0.0 + return self._current_state is ExecutionState.RUNNING diff --git a/codeguru_profiler_agent/utils/log_exception.py b/codeguru_profiler_agent/utils/log_exception.py new file mode 100644 index 0000000..4f7fed7 --- /dev/null +++ b/codeguru_profiler_agent/utils/log_exception.py @@ -0,0 +1,4 @@ + +def log_exception(logger, message): + logger.info(message) + logger.debug("Caught exception: ", exc_info=True) diff --git a/codeguru_profiler_agent/utils/scheduler.py b/codeguru_profiler_agent/utils/scheduler.py new file mode 100644 index 0000000..346cff3 --- /dev/null +++ b/codeguru_profiler_agent/utils/scheduler.py @@ -0,0 +1,108 @@ +from __future__ import absolute_import +import threading +import logging +import datetime + +from codeguru_profiler_agent.utils.execution_state import ExecutionState + +logger = logging.getLogger(__name__) + +DEFAULT_TIME_TO_AWAIT_TERMINATION_SECONDS = 2 + + +class Scheduler: + def __init__(self, + command, + delay_provider=None, + initial_delay=datetime.timedelta(), + thread_name=None, + args=None, + kwargs=None): + """ + Creates and executes a periodic action that will be run first without any delay and subsequently with the + given delay between the termination of one execution and the commencement of the next. + Scheduled thread gets terminated if exception thrown, hence subsequent execution will not happen. + The scheduler can be paused, it will then keep running but not execute the command until resume() is called. + + :param command: a new thread will be spawned for executing this command with arguments + specified in args and kwargs + :param delay_provider: function providing the delay between executions as a timedelta, default returns always 1s + :param initial_delay: delay before first execution as a timedelta, default is 0s. + :param thread_name: name of the new spawned thread + :param args: (list) passing argument by its position + :param kwargs: (dict) passing argument by the arguments' names + """ + self._command = command + self._thread = \ + threading.Thread(target=self._schedule_task_execution, name=thread_name) + self._thread.daemon = True + self._args = args if args is not None else [] + self._kwargs = kwargs if kwargs is not None else {} + self._state = ExecutionState( + delay_provider=delay_provider if delay_provider else lambda: datetime.timedelta(seconds=1), + initial_delay=initial_delay) + + def start(self): + if self.is_running(): + # nothing to do if we are already running + logger.info("Ignored Scheduler.start() as it is already running!") + return + try: + self._thread.start() + except RuntimeError: + # replace the exception from threading by ours with more explanations. + raise RuntimeError( + "Profiler cannot be started again after stop. 
Use a new Profiler instance or use pause() instead of stop()" + ) + + def is_running(self): + """ + This tells if the scheduler is currently running. + It still returns True when we are paused. + """ + return self._thread.is_alive() + + def is_paused(self): + return self.is_running() and self._state.is_paused() + + def stop(self): + """ + Stop the scheduled thread from executing the command and wait for termination. + """ + self._state.signal_stop() + if not self.is_running(): + return + self._thread.join(DEFAULT_TIME_TO_AWAIT_TERMINATION_SECONDS) + + def _schedule_task_execution(self): + should_run = self._state.wait_for_next_tick_or_stop() + while should_run: + should_run = \ + self._command(*self._args, **self._kwargs) and self._state.wait_for_next_tick_or_stop() + # call set_stopped in case it is the command that returned False. + self._state.set_stopped() + + def update_delay_provider(self, delay_provider): + self._state.delay_provider = delay_provider + + def resume(self, block=False): + """ + Will signal the sampling thread that profiling should resume. + + :param block: if True, we will not return from this function before the change is applied, default is False. + """ + self._state.signal_resume(block) + + def pause(self, block=False): + """ + Will signal the sampling thread that profiling should pause. + + :param block: if True, we will not return from this function before the change is applied, default is False. + """ + self._state.signal_pause(block) + + def _get_next_delay_seconds(self): + """ + Useful for testing + """ + return self._state.next_delay_seconds() diff --git a/codeguru_profiler_agent/utils/time.py b/codeguru_profiler_agent/utils/time.py new file mode 100644 index 0000000..5ebb3ec --- /dev/null +++ b/codeguru_profiler_agent/utils/time.py @@ -0,0 +1,13 @@ +from __future__ import absolute_import + +import time +from datetime import datetime + +def to_iso(epoch_milli): + try: + return datetime.fromtimestamp(epoch_milli / 1000).isoformat() + except ValueError: + return str(epoch_milli) + +def current_milli_time(clock=time.time): + return int(clock() * 1000) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a6f4260 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +boto3>=1.14.0 diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..d22074b --- /dev/null +++ b/setup.py @@ -0,0 +1,48 @@ +import codecs +import os +import re + +from setuptools import setup, find_packages + +REQUIREMENTS = [i.strip() for i in open("requirements.txt").readlines()] + +here = os.path.abspath(os.path.dirname(__file__)) + + +def read(*parts): + return codecs.open(os.path.join(here, *parts), 'r').read() + + +def find_version(*file_paths): + version_file = read(*file_paths) + version_match = re.search(r"^__agent_version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") + + +# https://packaging.python.org/tutorials/packaging-projects/#creating-the-package-files +setup( + name="codeguru_profiler_agent", + version=find_version("codeguru_profiler_agent/agent_metadata", "agent_metadata.py"), + packages=find_packages(exclude="test"), + + description="The Python agent to be used for Amazon CodeGuru Profiler", + long_description="https://docs.aws.amazon.com/codeguru/latest/profiler-ug/what-is-codeguru-profiler.html", + author="Amazon Web Services", + url="https://github.com/aws", + download_url="https://github.com/aws", + classifiers=[ + "Programming 
Language :: Python :: 3", + "Development Status :: 5 - Production/Stable", + "Topic :: Utilities", + "License :: OSI Approved :: Apache Software License" + ], + + python_requires='>=3.6', + # The Lambda layer doesn't use this file to install the following needed packages, + # so we have to make sure the customer has them set in place (by default or by updating the docs to install them). + # - boto3 is already included in the Lambda Runtime for Python + # https://docs.aws.amazon.com/lambda/latest/dg/lambda-python.html. + install_requires=REQUIREMENTS +) diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..a8386ba --- /dev/null +++ b/test/__init__.py @@ -0,0 +1,3 @@ +from . import help_utils +import logging +logging.getLogger('codeguru_profiler_agent').setLevel(logging.DEBUG) diff --git a/test/acceptance/test_cpu_limit.py b/test/acceptance/test_cpu_limit.py new file mode 100644 index 0000000..e413a21 --- /dev/null +++ b/test/acceptance/test_cpu_limit.py @@ -0,0 +1,36 @@ +from datetime import timedelta + +from codeguru_profiler_agent.metrics.timer import Timer +from codeguru_profiler_agent.profiler import Profiler +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata, DefaultFleetInfo +from test.help_utils import wait_for, DUMMY_TEST_PROFILING_GROUP_NAME +from test.pytestutils import before + + +class TestCPULimit: + class TestCPULimitReachedDuringProfiling: + @before + def before(self): + self.timer = Timer() + self.profiler = Profiler( + profiling_group_name=DUMMY_TEST_PROFILING_GROUP_NAME, + environment_override={ + "timer": self.timer, + "cpu_limit_percentage": 40, + "sampling_interval": timedelta(seconds=0.01), + 'agent_metadata': AgentMetadata(fleet_info=DefaultFleetInfo()) + }, + ) + yield + self.profiler.stop() + + def test_profiler_terminates(self): + self.profiler.start() + assert self.profiler.is_running() + + # With sampling_interval to be 0.01 seconds, having runProfiler as 0.5 seconds should breach + # the cpu limit. 
We need to sample 20 times before we check the CPU limit + for i in range(20): + self.timer.record('runProfiler', 0.5) + + assert wait_for(lambda: not self.profiler.is_running(), timeout_seconds=5) diff --git a/test/acceptance/test_end_to_end_profile_and_save_to_file.py b/test/acceptance/test_end_to_end_profile_and_save_to_file.py new file mode 100644 index 0000000..004b74a --- /dev/null +++ b/test/acceptance/test_end_to_end_profile_and_save_to_file.py @@ -0,0 +1,102 @@ +import json +import platform +import shutil +import tempfile +import os + +from datetime import timedelta +from mock import patch +from pathlib import Path + +from codeguru_profiler_agent.profiler import Profiler +from codeguru_profiler_agent.utils import time as time_utils +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata, DefaultFleetInfo +from test.help_utils import HelperThreadRunner, DUMMY_TEST_PROFILING_GROUP_NAME, FILE_PREFIX +from test.pytestutils import before + + +def frames_in_callgraph_are_in_expected_order(node, parent_frame, child_frame): + if not "children" in node: + return False + for child_node in node["children"]: + if parent_frame in child_node: + for grand_child_node in node["children"][child_node]["children"].keys(): + if child_frame in grand_child_node: + return True + if frames_in_callgraph_are_in_expected_order( + node["children"][child_node], parent_frame, child_frame): + return True + return False + + +class TestEndToEndProfileAndSaveToFile: + @before + def before(self): + self.temporary_directory = tempfile.mkdtemp() + + helper = HelperThreadRunner() + helper.new_helper_thread_blocked_inside_dummy_method() + + yield + + helper.stop_helper_thread() + + shutil.rmtree(self.temporary_directory) + + def test_it_samples_and_saves_a_profile_to_a_file(self): + with \ + patch( + "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration.is_under_min_reporting_time", + return_value=False): + + file_prefix = str(Path(self.temporary_directory, FILE_PREFIX)) + + test_start_time = time_utils.current_milli_time() + + profiler = Profiler( + profiling_group_name=DUMMY_TEST_PROFILING_GROUP_NAME, + environment_override={ + "initial_sampling_interval": timedelta(), + "reporting_mode": "file", + "file_prefix": file_prefix, + 'agent_metadata': AgentMetadata(fleet_info=DefaultFleetInfo()) + }) + + try: + profiler.start() + finally: + profiler.stop() + + test_end_time = time_utils.current_milli_time() + + resulting_profile_path = str( + Path(self.temporary_directory, + os.listdir(self.temporary_directory)[0])) + + with (open(resulting_profile_path)) as profiling_result_file: + resulting_json = json.loads(profiling_result_file.read()) + + self.assert_valid_agent_metadata(resulting_json["agentMetadata"]) + assert test_start_time <= resulting_json["start"] <= resulting_json["end"] <= test_end_time + assert frames_in_callgraph_are_in_expected_order( + resulting_json["callgraph"], + "test.help_utils:HelperThreadRunner:dummy_parent_method", + "test.help_utils:HelperThreadRunner:dummy_method") + + @staticmethod + def assert_valid_agent_metadata(agent_metadata): + assert agent_metadata["agentInfo"] + assert agent_metadata["fleetInfo"] + assert agent_metadata["runtimeVersion"] + assert agent_metadata["sampleWeights"] + assert agent_metadata["agentOverhead"] + assert agent_metadata["durationInMs"] + assert agent_metadata["sampleWeights"]["WALL_TIME"] + assert agent_metadata["agentOverhead"]["memory_usage_mb"] + + if platform.system() != "Windows": + # Due to the issue mentioned on 
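+            # Each recorded 0.5 s 'runProfiler' run against a 0.01 s sampling interval implies
+            # an overhead of roughly 0.5 / 0.01 = 5000%, far above the 40% cpu_limit_percentage
+            # configured above, so the disabler is expected to shut the profiler down.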
https://bugs.python.org/issue37859, we would skip checking agentOverhead for + # Windows system as the agent is only run for very short period of time. We may improve the accuracy of + # measuring the overhead by using time.perf_counter_ns for Windows in the future. + assert agent_metadata["agentOverhead"]["timeInMs"] + assert agent_metadata["cpuTimeInSeconds"] > 0 diff --git a/test/acceptance/test_end_to_end_profiling.py b/test/acceptance/test_end_to_end_profiling.py new file mode 100644 index 0000000..771df02 --- /dev/null +++ b/test/acceptance/test_end_to_end_profiling.py @@ -0,0 +1,53 @@ +from botocore.stub import Stubber, ANY +from datetime import timedelta +from mock import patch +from test.pytestutils import before + +from codeguru_profiler_agent.profiler import Profiler +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata, DefaultFleetInfo +from test.help_utils import DUMMY_TEST_PROFILING_GROUP_NAME + + +class TestEndToEndProfiling: + @before + def before(self): + self.profiler = Profiler( + profiling_group_name=DUMMY_TEST_PROFILING_GROUP_NAME, + environment_override={'initial_sampling_interval': timedelta(), + 'agent_metadata': AgentMetadata(fleet_info=DefaultFleetInfo())} + ) + self.client_stubber = Stubber( + self.profiler._profiler_runner.collector.reporter.codeguru_client_builder.codeguru_client) + report_expected_params = { + 'agentProfile': ANY, + 'contentType': 'application/json', + 'profilingGroupName': DUMMY_TEST_PROFILING_GROUP_NAME + } + self.client_stubber.add_response('configure_agent', + {'configuration': {'shouldProfile': True, 'periodInSeconds': 100}}) + self.client_stubber.add_response('post_agent_profile', {}, report_expected_params) + yield + self.profiler.stop() + + def test_report_when_stopped(self): + with \ + patch( + "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration.is_under_min_reporting_time", + return_value=False): + with self.client_stubber: + self.profiler.start() + self.profiler.stop() # stop will trigger the report + + # check that we have reported. assert_no_pending_responses will fail if post_agent_profile is not called + # with the expected parameters. 
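+            # (botocore's Stubber raises if a stubbed operation is called with parameters that do
+            # not match, and assert_no_pending_responses fails if a queued response, such as the
+            # post_agent_profile one above, was never consumed.)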
+ self.client_stubber.assert_no_pending_responses() + + def test_report_via_invalid_reporting_mode(self): + self.profiler = Profiler( + profiling_group_name=DUMMY_TEST_PROFILING_GROUP_NAME, + environment_override={ + "reporting_mode": "invalid_reporting_mode", + }) + + self.profiler.start() + assert self.profiler.is_running() is False diff --git a/test/acceptance/test_kill_switch.py b/test/acceptance/test_kill_switch.py new file mode 100644 index 0000000..f032220 --- /dev/null +++ b/test/acceptance/test_kill_switch.py @@ -0,0 +1,55 @@ +import tempfile +import shutil + +from datetime import timedelta +from pathlib import Path + +from codeguru_profiler_agent.profiler import Profiler +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata, DefaultFleetInfo +from test.help_utils import wait_for, DUMMY_TEST_PROFILING_GROUP_NAME +from test.pytestutils import before + + +class TestKillSwitch: + class TestKillSwitchActivatesAtStart: + def test_profiler_does_not_start(self): + with tempfile.NamedTemporaryFile() as killswitch_file: + profiler = Profiler( + profiling_group_name=DUMMY_TEST_PROFILING_GROUP_NAME, + environment_override={"killswitch_filepath": killswitch_file.name}) + + try: + assert profiler.start() is False + finally: + profiler.stop() + + class TestKillSwitchActivatesDuringExecution: + @before + def before(self): + temporary_directory = tempfile.mkdtemp() + self.temp_filepath = str( + Path(temporary_directory, + 'test_profiler_stops_after_killswitch_was_detected')) + + self.profiler = Profiler( + profiling_group_name="test-application", + environment_override={ + "cpu_limit_percentage": None, + "killswitch_filepath": self.temp_filepath, + "sampling_interval": timedelta(seconds=0.1), + 'agent_metadata': AgentMetadata(fleet_info=DefaultFleetInfo()) + }) + yield + shutil.rmtree(temporary_directory) + self.profiler.stop() + + def test_profiler_stops_after_killswitch_was_detected(self): + self.profiler.start() + assert self.profiler.is_running() is True + + Path(self.temp_filepath).touch() + + # Force the killswitch check happens immediately + self.profiler._profiler_runner_instance.profiler_disabler.killswitch.last_check_for_file_time = None + + assert (wait_for(lambda: not self.profiler.is_running(), timeout_seconds=5)) diff --git a/test/acceptance/test_live_profiling.py b/test/acceptance/test_live_profiling.py new file mode 100644 index 0000000..6f4d474 --- /dev/null +++ b/test/acceptance/test_live_profiling.py @@ -0,0 +1,77 @@ +import time + +from datetime import timedelta +from mock import patch + +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration +from codeguru_profiler_agent.profiler import Profiler +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata, DefaultFleetInfo +from test.help_utils import DUMMY_TEST_PROFILING_GROUP_NAME + + +class TestLiveProfiling: + + def test_live_profiling(self): + with \ + patch( + "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration.is_under_min_reporting_time", + return_value=False), \ + patch( + "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration._is_reporting_interval_smaller_than_minimum_allowed", + return_value=False): + + profiler = Profiler( + profiling_group_name=DUMMY_TEST_PROFILING_GROUP_NAME, + region_name='eu-west-2', + environment_override={"initial_sampling_interval": timedelta(), + "sampling_interval": timedelta(seconds=1), + "reporting_interval": timedelta(seconds=2), + 'agent_metadata': 
AgentMetadata(fleet_info=DefaultFleetInfo())}
+            )
+
+            client = profiler._profiler_runner.collector.reporter.codeguru_client_builder.codeguru_client
+            aggregator = profiler._profiler_runner.collector
+
+            assert AgentConfiguration.get().sampling_interval == timedelta(seconds=1)
+            assert AgentConfiguration.get().reporting_interval == timedelta(seconds=2)
+
+            with \
+                    patch.object(client, "post_agent_profile",
+                                 wraps=client.post_agent_profile) as wrapped_post_agent_profile, \
+                    patch.object(client, "configure_agent",
+                                 wraps=client.configure_agent) as wrapped_configure_agent, \
+                    patch.object(aggregator, "add",
+                                 wraps=aggregator.add) as wrapped_add, \
+                    patch(
+                        "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration.is_under_min_reporting_time",
+                        return_value=False), \
+                    patch(
+                        "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration._is_reporting_interval_smaller_than_minimum_allowed",
+                        return_value=False):
+
+                wrapped_configure_agent.return_value = {
+                    "configuration": {
+                        "agentParameters": {
+                            "SamplingIntervalInMilliseconds": "100",
+                            "MinimumTimeForReportingInMilliseconds": "1000",
+                            "MaxStackDepth": "1000",
+                            "MemoryUsageLimitPercent": "29"
+                        },
+                        "periodInSeconds": 2,
+                        "shouldProfile": True
+                    }
+                }
+
+                try:
+                    start_status = profiler.start()
+                    assert start_status
+                    assert profiler.is_running()
+                    time.sleep(3)
+                finally:
+                    profiler.stop()
+
+                assert wrapped_add.call_count >= 3
+                assert wrapped_post_agent_profile.call_count >= 1
+                assert wrapped_configure_agent.call_count >= 1
+                assert AgentConfiguration.get().sampling_interval == timedelta(seconds=1)
+                assert AgentConfiguration.get().reporting_interval == timedelta(seconds=2)
diff --git a/test/conftest.py b/test/conftest.py
new file mode 100644
index 0000000..a192787
--- /dev/null
+++ b/test/conftest.py
@@ -0,0 +1,4 @@
+import pytest
+
+# Enable the pytestutils @focus decorator
+pytest_plugins = "test.pytestutils_focus"
diff --git a/test/help_utils.py b/test/help_utils.py
new file mode 100644
index 0000000..5b63700
--- /dev/null
+++ b/test/help_utils.py
@@ -0,0 +1,61 @@
+import threading
+import time
+from queue import Queue
+
+INTEGRATION_TEST_ACCOUNT_ID = "519630429520"
+MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS = "MyProfilingGroupForIntegrationTests"
+DUMMY_TEST_PROFILING_GROUP_NAME = "DummyNameThatWillNotBeUsed"
+
+FILE_PREFIX = "pytest-CodeGuruPythonAgent"
+
+
+class HelperThreadRunner:
+    def __init__(self):
+        pass
+
+    def new_helper_sleep_thread(self, sleep_sec=1, thread_name="test-sleep-thread"):
+        self.sleep_thread = threading.Thread(
+            name=thread_name, target=self.sleep_for, daemon=True, kwargs={"sleep_sec": sleep_sec})
+        self.sleep_thread.start()
+
+    def sleep_for(self, sleep_sec):
+        time.sleep(sleep_sec)
+
+    def new_helper_thread_blocked_inside_dummy_method(
+            self, thread_name="test-thread"):
+        self.ready_queue = Queue()
+        self.done_queue = Queue()
+
+        self.thread = threading.Thread(
+            name=thread_name, target=self.dummy_parent_method)
+        self.thread.start()
+        self.ready_queue.get()
+
+    def stop_helper_thread(self):
+        self.done_queue.put(True)
+        self.thread.join()
+
+    def dummy_method(self):
+        """
+        Running this function in a thread provides us a test stack to compare against.
+        """
+        self.ready_queue.put(True)
+        self.done_queue.get()
+
+    def dummy_parent_method(self):
+        """
+        By having this parent method call dummy_method, we create an ordered stack for testing.
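+        The end-to-end tests assert that dummy_parent_method appears as the direct
+        parent of dummy_method in the reported callgraph.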
+ """ + self.dummy_method() + + +def wait_for(condition, timeout_seconds=1.0, poll_interval_seconds=0.01): + """ + Timed out waiting for condition to be true + """ + end_time = time.time() + timeout_seconds + while time.time() < end_time: + if condition(): + return True + time.sleep(poll_interval_seconds) + raise Exception("wait_for: Timed out waiting for condition to be true") diff --git a/test/integration/README.md b/test/integration/README.md new file mode 100644 index 0000000..40f5959 --- /dev/null +++ b/test/integration/README.md @@ -0,0 +1,26 @@ +# Overview + +The tests from this folder are marked to be skipped when run on shared build fleet because they require AWS credentials. + +# Run locally + +For running them locally on your development machine, follow the next steps. + +1. Make sure you have installed the latest version of [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-welcome.html). + +2. Use an IAM entity for the AWS CLI that has permissions to access CodeGuru Profiler. + ``` + aws configure # Set up your AWS credentials and region as usual. + ``` + +3. Create a PG with the name ``MyProfilingGroupForIntegrationTests`` + ``` + aws codeguruprofiler create-profiling-group --profiling-group-name MyProfilingGroupForIntegrationTests + ``` + +4. To run the integration tests with logs enabled, run the following command. + ``` + pytest -v -o log_cli=true test/integration + ``` + +5. Consider including the output in the PR for an easier review. \ No newline at end of file diff --git a/test/integration/test_live_backend_reporting.py b/test/integration/test_live_backend_reporting.py new file mode 100644 index 0000000..a911239 --- /dev/null +++ b/test/integration/test_live_backend_reporting.py @@ -0,0 +1,124 @@ +import boto3 +import pytest +import socket +import time + +from datetime import timedelta + +from test.help_utils import MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS +from test.pytestutils import before + +from codeguru_profiler_agent.model.frame import Frame +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration, AgentConfigurationMerger +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata, DefaultFleetInfo +from codeguru_profiler_agent.sdk_reporter.sdk_reporter import SdkReporter +from codeguru_profiler_agent.codeguru_client_builder import CodeGuruClientBuilder +from codeguru_profiler_agent.model.profile import Profile +from codeguru_profiler_agent.model.sample import Sample + + +@pytest.mark.skipif( + # TODO FIXME Remove the conditions for skipping this on Amazonian fleets when we move fully to GitHub. + socket.gethostname().startswith("pb-worker-prod") or + socket.gethostname().startswith("coverlay-") or + socket.getfqdn().endswith("internal.cloudapp.net"), # hosts running ubuntu and windows in GitHub + socket.getfqdn().endswith("ip6.arpa"), # hosts running macs in GitHub + reason="This integration test is skipped on any shared fleet from Amazon or GitHub " + "because it needs credentials to access the backend service. 
" + "For information on how to run this locally, read the README.md file from the test/integration/ folder.") +class TestLiveBackendReporting: + @before + def before(self): + now_millis = int(time.time()) * 1000 + five_minutes_ago_millis = now_millis - (5 * 60 * 1000) + sample = Sample( + stacks=[[Frame(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS)]], + attempted_sample_threads_count=1, + seen_threads_count=1) + + self.profile = Profile(MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, 1.0, 1.0, five_minutes_ago_millis) + # FIXME: Remove adding the end time manually below after feature fully support + self.profile.end = now_millis + self.profile.add(sample) + + self.environment = { + "should_profile": True, + "profiling_group_name": MY_PROFILING_GROUP_NAME_FOR_INTEG_TESTS, + "aws_session": boto3.session.Session(), + "reporting_interval": timedelta(minutes=13), + "sampling_interval": timedelta(seconds=1), + "minimum_time_reporting": timedelta(minutes=6), + "max_stack_depth": 2345, + "cpu_limit_percentage": 29, + "agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()) + } + self.environment["codeguru_profiler_builder"] = CodeGuruClientBuilder(self.environment) + self.agent_config = AgentConfiguration( + should_profile=True, + sampling_interval=self.environment["sampling_interval"], + reporting_interval=self.environment["reporting_interval"], + minimum_time_reporting=self.environment["minimum_time_reporting"], + max_stack_depth=self.environment["max_stack_depth"], + cpu_limit_percentage=self.environment["cpu_limit_percentage"]) + + def test_beta_endpoint_call_report_and_refresh_and_do_not_override_user_overrides_agent_configuration(self): + self.environment["agent_config_merger"] = AgentConfigurationMerger(user_overrides=self.agent_config) + + sdk_reporter = SdkReporter(self.environment) + sdk_reporter.setup() + + self.assert_initial_values() + assert sdk_reporter.report(self.profile) is True + + sdk_reporter.refresh_configuration() + self.assert_initial_values() + + def test_beta_endpoint_call_report_and_refresh_and_overrides_default_agent_configuration(self): + self.environment["agent_config_merger"] = AgentConfigurationMerger(default=self.agent_config) + + sdk_reporter = SdkReporter(self.environment) + sdk_reporter.setup() + self.assert_initial_values() + assert sdk_reporter.report(self.profile) is True + + sdk_reporter.refresh_configuration() + assert AgentConfiguration.get().should_profile is True + assert AgentConfiguration.get().sampling_interval == timedelta(seconds=1) + assert AgentConfiguration.get().reporting_interval == timedelta(minutes=5) + assert AgentConfiguration.get().minimum_time_reporting == timedelta(seconds=60) + assert AgentConfiguration.get().max_stack_depth == 1000 + assert AgentConfiguration.get().cpu_limit_percentage == 10 + + def test_beta_endpoint_call_report_and_refresh_and_do_not_override_one_setting_of_default_agent_configuration(self): + self.environment["agent_config_merger"] = AgentConfigurationMerger(default=self.agent_config, + user_overrides=AgentConfiguration( + sampling_interval=timedelta(seconds=2))) + + sdk_reporter = SdkReporter(self.environment) + sdk_reporter.setup() + + assert AgentConfiguration.get().should_profile is True + assert AgentConfiguration.get().sampling_interval == timedelta(seconds=2) + assert AgentConfiguration.get().reporting_interval == timedelta(minutes=13) + assert AgentConfiguration.get().minimum_time_reporting == timedelta(minutes=6) + assert AgentConfiguration.get().max_stack_depth == 2345 + assert 
AgentConfiguration.get().cpu_limit_percentage == 29 + + assert sdk_reporter.report(self.profile) is True + + sdk_reporter.refresh_configuration() + assert AgentConfiguration.get().should_profile is True + assert AgentConfiguration.get().sampling_interval == timedelta(seconds=2) + assert AgentConfiguration.get().reporting_interval == timedelta(minutes=5) + assert AgentConfiguration.get().minimum_time_reporting == timedelta(seconds=60) + assert AgentConfiguration.get().max_stack_depth == 1000 + assert AgentConfiguration.get().cpu_limit_percentage == 10 + + @staticmethod + def assert_initial_values(): + assert AgentConfiguration.get().should_profile is True + assert AgentConfiguration.get().sampling_interval == timedelta(seconds=1) + assert AgentConfiguration.get().reporting_interval == timedelta(minutes=13) + assert AgentConfiguration.get().minimum_time_reporting == timedelta(minutes=6) + assert AgentConfiguration.get().max_stack_depth == 2345 + assert AgentConfiguration.get().cpu_limit_percentage == 29 diff --git a/test/pytestutils.py b/test/pytestutils.py new file mode 100644 index 0000000..cdb0d0d --- /dev/null +++ b/test/pytestutils.py @@ -0,0 +1,110 @@ +import pytest +import os +import threading + +_pytestutils_before_global_counter = 0 +_pytestutils_before_global_lock = threading.Lock() + +_pytestutils_is_there_any_test_marked_with_focus = False + + +def before(before_function): + """ + Decorator for tagging a function as running before the tests. This works by decorating it further with a + pytest.fixture(autouse=True) which makes pytest automatically trigger its execution before the tests. + + Additionally, this function keeps a global counter that's used to generate unique names for the pytest fixtures, + as otherwise they would take their name from the function being decorated, which would "break" nested use, e.g. + consider: + + class TestA: + @pytest.fixture(autouse=True) + def before(self): + print("This is the top level before") + + class TestB: + @pytest.fixture(autouse=True) + def before(self): + print("this is the inner before") + + def test_it_does_stuff(self): + print("this is the test") + + This does not run both before functions (as the inner one shadows the outer one), whereas with our decorator + + class TestA: + @custom_before + def before(self): + print("This is the top level before") + + class TestB: + @custom_before + def before(self): + print("this is the inner before") + + def test_it_does_stuff(self): + print("this is the test") + + works as expected. + """ + global _pytestutils_before_global_counter + + with _pytestutils_before_global_lock: + unique_fixture_name = \ + "wrapped_before_{}".format(_pytestutils_before_global_counter) + _pytestutils_before_global_counter += 1 + + return pytest.fixture( + autouse=True, name=unique_fixture_name)(before_function) + + +def focus(decorated_test): + """ + Decorator for tagging a function or class as being in focus, and to switch the test suite execution to "focused + execution mode". When the test suite is in "focused execution mode", only functions and classes marked with @focus + are run, all others are skipped. + + When there are no functions/classes marked with @focus, the test suite goes back to the usual mode, and all tests + are run. + + The focused execution mode is useful for quickly and without needing to edit any more configuration/files selecting + only a subset of the tests for execution, to speed up testing cycles. + + This decorator is inspired by similar IDE features (e.g. 
https://blogs.oracle.com/geertjan/run-focused-test-method )
+    and other test libraries (e.g. https://medium.com/table-xi/focus-your-rspec-workflow-4cd5798d2a3e ).
+
+    Limitation when used with Pytest < 3.6: @focus does not extend to nested classes, see
+    https://github.com/pytest-dev/pytest/issues/199 and https://github.com/pytest-dev/pytest/pull/3317 for details.
+
+    This feature is broken into several pieces:
+    * The @focus decorator sets a global variable to trigger the focused execution mode, and additionally marks any
+      test method or class with the "pytestutils_focus" marker
+    * In pytestutils_focus.py a pytest plugin is provided that, when in focused execution mode, skips any tests not
+      marked with the "pytestutils_focus" marker
+    * In conftest.py we enable the pytest plugin
+    """
+    _validate_focus_enabled()
+
+    global _pytestutils_is_there_any_test_marked_with_focus
+
+    _pytestutils_is_there_any_test_marked_with_focus = True
+
+    return pytest.mark.pytestutils_focus(decorated_test)
+
+
+def _validate_focus_enabled():
+    if os.environ.get("PYTESTUTILS_ALLOW_FOCUS") in ("1", "true", "yes"):
+        return
+    raise RuntimeError("""
+Found tests annotated with the @focus decorator, but the PYTESTUTILS_ALLOW_FOCUS environment variable is not set in the
+current environment.
+
+If you found this error in a CI environment, it means someone committed test code with a @focus annotation -- please
+check for and remove it from the codebase.
+
+If you found this error and you wanted to use @focus for your own development work, please add a PYTESTUTILS_ALLOW_FOCUS
+environment variable set to 1 (e.g. `export PYTESTUTILS_ALLOW_FOCUS=1`) to your execution environment to make this error
+go away.
+
+Thanks for using @focus!
+    """)
diff --git a/test/pytestutils_focus.py b/test/pytestutils_focus.py
new file mode 100644
index 0000000..55eacb6
--- /dev/null
+++ b/test/pytestutils_focus.py
@@ -0,0 +1,8 @@
+import pytest
+import test.pytestutils as pytestutils
+
+
+def pytest_runtest_setup(item):
+    if pytestutils._pytestutils_is_there_any_test_marked_with_focus and not item.get_marker(
+            name='pytestutils_focus'):
+        pytest.skip("Test skipped in focus mode")
diff --git a/test/unit/agent_metadata/test_agent_metadata.py b/test/unit/agent_metadata/test_agent_metadata.py
new file mode 100644
index 0000000..eb34fbd
--- /dev/null
+++ b/test/unit/agent_metadata/test_agent_metadata.py
@@ -0,0 +1,63 @@
+import pytest
+from codeguru_profiler_agent.agent_metadata.agent_metadata import look_up_fleet_info, AgentInfo, AgentMetadata
+from codeguru_profiler_agent.agent_metadata.aws_ec2_instance import AWSEC2Instance
+from codeguru_profiler_agent.agent_metadata.fleet_info import DefaultFleetInfo, http_get
+
+
+class TestAgentMetadata:
+    class TestLookUpFleetInfo:
+        class TestWhenMetadataIsNotAvailable:
+            def test_it_returns_default_fleet_info(self):
+                subject = look_up_fleet_info(platform_metadata_fetchers=(lambda: None,))
+
+                assert isinstance(subject, DefaultFleetInfo)
+                fleet_instance_id = subject.get_fleet_instance_id()
+                assert fleet_instance_id is not None
+                fleet_instance_serialized = subject.serialize_to_map()
+                assert fleet_instance_serialized["id"] == fleet_instance_id
+                assert fleet_instance_serialized["type"] == "UNKNOWN"
+                assert fleet_instance_serialized["vCPUs"] is not None
+                assert subject.get_metadata_for_configure_agent_call() is None
+
+        class TestWhenMetadataIsAvailable:
+            def test_it_returns_fleet_info(self):
+                test_fleet_info = AWSEC2Instance(host_name="testHost", host_type="testType")
+                subject =
look_up_fleet_info( + platform_metadata_fetchers=(lambda: None, lambda: test_fleet_info) + ) + + assert subject == test_fleet_info + + class TestAgentMetadataInit: + class TestWhenFleetInfoIsNotAvailable: + def test_it_returns_default_agent_metadata(self): + subject = AgentMetadata() + + assert subject.agent_info == AgentInfo.default_agent_info() + assert subject.fleet_info is not None + assert subject.runtime_version[0] == "3" + + class TestAgentInfo: + class TestEqual: + def test_it_does_equality_correctly(self): + subject = AgentInfo(agent_type="testAgentType", version="2345") + same = AgentInfo(agent_type="testAgentType", version="2345") + + assert subject == same + assert subject.agent_type == same.agent_type + assert subject.version == same.version + + different = AgentInfo(agent_type="testDifferentAgentType", version="9876") + + assert subject != different + assert subject.agent_type != different.agent_type + assert subject.version != different.version + + assert subject != "abc" + + +class TestHttpGet: + class TestWhenUrlIsNotHttp: + def test_it_throws_value_error(self): + with pytest.raises(ValueError): + http_get(url='file://sensitive_file.txt') diff --git a/test/unit/agent_metadata/test_aws_ec2_instance.py b/test/unit/agent_metadata/test_aws_ec2_instance.py new file mode 100644 index 0000000..2b3d1db --- /dev/null +++ b/test/unit/agent_metadata/test_aws_ec2_instance.py @@ -0,0 +1,105 @@ +import pytest +import httpretty +import sys +from codeguru_profiler_agent.agent_metadata.aws_ec2_instance import EC2_HOST_NAME_URI, \ + EC2_HOST_INSTANCE_TYPE_URI +from codeguru_profiler_agent.agent_metadata.aws_ec2_instance import AWSEC2Instance + + +@pytest.mark.skipif(sys.version_info >= (3, 9), + reason="These tests require use of httpPretty that doesn't support yet Python 3.9.") +class TestAWSEC2Instance: + class TestLookUpMetadata: + class TestWhenCalledOutsideEc2: + @pytest.fixture(autouse=True) + def around(self): + httpretty.enable() + httpretty.HTTPretty.allow_net_connect = False + yield + httpretty.disable() + httpretty.reset() + + def test_it_returns_none(self): + """ + With httpretty, we have disabled all network connection; hence the call to get the ec2 metadata should + always fail. 
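+                (httpretty with allow_net_connect set to False raises on any request that has
+                no registered mock, and look_up_metadata is expected to swallow that error and
+                return None rather than propagate it.)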
+ """ + subject = AWSEC2Instance.look_up_metadata() + + assert subject is None + + class TestWhenHostNameAndHostTypeCanBeDetermined: + @pytest.fixture(autouse=True) + def around(self): + httpretty.enable() + httpretty.HTTPretty.allow_net_connect = False + httpretty.register_uri( + httpretty.GET, + EC2_HOST_NAME_URI, + body="testHostName") + httpretty.register_uri( + httpretty.GET, + EC2_HOST_INSTANCE_TYPE_URI, + body="testHostType") + yield + httpretty.disable() + httpretty.reset() + + def test_it_returns_host_name_and_host_type(self): + subject = AWSEC2Instance.look_up_metadata() + + assert subject.host_name == "testHostName" + assert subject.host_type == "testHostType" + assert subject.get_fleet_instance_id() == "testHostName" + + class TestWhenHostNameCannotBeDetermined: + @pytest.fixture(autouse=True) + def around(self): + httpretty.enable() + httpretty.HTTPretty.allow_net_connect = False + httpretty.register_uri( + httpretty.GET, + EC2_HOST_INSTANCE_TYPE_URI, + body="testHostType") + yield + httpretty.disable() + httpretty.reset() + + def test_it_returns_none(self): + subject = AWSEC2Instance.look_up_metadata() + + assert subject is None + + class TestWhenHostTypeCannotBeDetermined: + @pytest.fixture(autouse=True) + def around(self): + httpretty.enable() + httpretty.HTTPretty.allow_net_connect = False + httpretty.register_uri( + httpretty.GET, + EC2_HOST_NAME_URI, + body="testHostName") + yield + httpretty.disable() + httpretty.reset() + + def test_it_returns_none(self): + subject = AWSEC2Instance.look_up_metadata() + + assert subject is None + + class TestSerializeToMap: + def test_it_returns_a_map(self): + subject = AWSEC2Instance(host_name="testHostName", host_type="testHostType") + + assert subject.serialize_to_map() == { + "computeType": "aws_ec2_instance", + "hostName": "testHostName", + "hostType": "testHostType" + } + + class TestGetMetadataForConfigureAgentCall: + def test_it_returns_the_compute_platform(self): + subject = AWSEC2Instance(host_name="testHostName", host_type="testHostType") + + assert subject.get_metadata_for_configure_agent_call() == None diff --git a/test/unit/agent_metadata/test_aws_fargate_task.py b/test/unit/agent_metadata/test_aws_fargate_task.py new file mode 100644 index 0000000..3d94fd9 --- /dev/null +++ b/test/unit/agent_metadata/test_aws_fargate_task.py @@ -0,0 +1,135 @@ +import httpretty +import pytest +import sys +from codeguru_profiler_agent.agent_metadata.aws_fargate_task import AWSFargateTask + +DUMMY_URI_FOR_TEST = "http://dummy-ecs/meta-data" + + +@pytest.mark.skipif(sys.version_info >= (3, 9), + reason="These tests require use of httpPretty that doesn't support yet Python 3.9.") +class TestAWSFargateTask: + class TestLookUpMetadata: + """ + See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-metadata-endpoint-v3.html for details of + the format returned by the ecs container metadata endpoint. 
+ """ + + class TestWhenCalledOutsideFargate: + @pytest.fixture(autouse=True) + def around(self): + httpretty.enable() + httpretty.HTTPretty.allow_net_connect = False + yield + httpretty.disable() + httpretty.reset() + + def test_it_returns_none_when_url_is_not_set(self): + subject = AWSFargateTask.look_up_metadata() + assert subject is None + + def test_it_returns_none_when_url_is_none(self): + subject = AWSFargateTask.look_up_metadata(url=None) + assert subject is None + + def test_it_returns_none_when_call_fails(self): + """ + With httpretty, we have disabled all network connection; hence the call to get the fargate metadata + should always fail. + """ + subject = AWSFargateTask.look_up_metadata(url=DUMMY_URI_FOR_TEST) + + assert subject is None + + class TestWhenTaskArnAndLimitsCanBeDetermined: + @pytest.fixture(autouse=True) + def around(self): + httpretty.enable() + httpretty.HTTPretty.allow_net_connect = False + httpretty.register_uri( + httpretty.GET, + DUMMY_URI_FOR_TEST + "/task", + body='{"TaskARN": "testTaskARN", "Limits": {"CPU": 123, "Memory": 789}}') + yield + httpretty.disable() + httpretty.reset() + + def test_it_returns_host_name_and_host_type(self): + subject = AWSFargateTask.look_up_metadata(url=DUMMY_URI_FOR_TEST) + + assert subject.task_arn == "testTaskARN" + assert subject.cpu_limit == 123 + assert subject.memory_limit_in_mb == 789 + assert subject.get_fleet_instance_id() == "testTaskARN" + + class TestWhenTaskArnCannotBeDetermined: + @pytest.fixture(autouse=True) + def around(self): + httpretty.enable() + httpretty.HTTPretty.allow_net_connect = False + httpretty.register_uri( + httpretty.GET, + DUMMY_URI_FOR_TEST + "/task", + body='{"Limits": {"CPU": 123, "Memory": 789}}') + yield + httpretty.disable() + httpretty.reset() + + def test_it_returns_none(self): + subject = AWSFargateTask.look_up_metadata(url=DUMMY_URI_FOR_TEST) + + assert subject is None + + class TestWhenLimitsCannotBeDetermined: + class TestWhenCPULimitIsMissing: + @pytest.fixture(autouse=True) + def around(self): + httpretty.enable() + httpretty.HTTPretty.allow_net_connect = False + httpretty.register_uri( + httpretty.GET, + DUMMY_URI_FOR_TEST + "/task", + body='{"TaskARN": "testTaskARN", "Limits": {"Memory": 789}}') + yield + httpretty.disable() + httpretty.reset() + + def test_it_returns_none_for_cpu_limit(self): + subject = AWSFargateTask.look_up_metadata(url=DUMMY_URI_FOR_TEST) + + assert subject.cpu_limit is None + + class TestWhenMemoryLimitIsMissing: + @pytest.fixture(autouse=True) + def around(self): + httpretty.enable() + httpretty.HTTPretty.allow_net_connect = False + httpretty.register_uri( + httpretty.GET, + DUMMY_URI_FOR_TEST + "/task", + body='{"TaskARN": "testTaskARN", "Limits": {"CPU": 123}}') + yield + httpretty.disable() + httpretty.reset() + + def test_it_returns_none(self): + subject = AWSFargateTask.look_up_metadata(url=DUMMY_URI_FOR_TEST) + + assert subject.memory_limit_in_mb is None + + class TestSerializeToMap: + def test_it_returns_a_map(self): + subject = AWSFargateTask(task_arn="testTaskArn", cpu_limit=100, memory_limit_in_mb=200) + + assert subject.serialize_to_map() == { + "computeType": "aws_fargate_task", + "taskArn": "testTaskArn", + "cpuLimit": 100, + "memoryLimitInMB": 200 + } + + class TestGetMetadataForConfigureAgentCall: + def test_it_returns_the_compute_platform(self): + subject = AWSFargateTask(task_arn="testTaskArn", cpu_limit=100, memory_limit_in_mb=200) + + assert subject.get_metadata_for_configure_agent_call() == None diff --git 
a/test/unit/agent_metadata/test_aws_lambda.py b/test/unit/agent_metadata/test_aws_lambda.py new file mode 100644 index 0000000..99265f7 --- /dev/null +++ b/test/unit/agent_metadata/test_aws_lambda.py @@ -0,0 +1,138 @@ +import pytest +from test.pytestutils import before +from mock import Mock +from datetime import timedelta +from codeguru_profiler_agent.agent_metadata.aws_lambda import AWSLambda +from codeguru_profiler_agent.aws_lambda.lambda_context import LambdaContext + + +class TestAWSLambda: + class TestLookUpMetadata: + class TestWhenContextIsAvailable: + class TestWhenEnvIsAvailable: + @before + def before(self): + self.context = Mock() + self.context.invoked_function_arn = "the_lambda_function_arn" + self.env = {"AWS_EXECUTION_ENV": "AWS_Lambda_python3.6", "AWS_LAMBDA_FUNCTION_MEMORY_SIZE": "2048"} + + def test_it_finds_the_arn(self): + subject = AWSLambda.look_up_metadata(self.context, self.env) + assert subject.function_arn == "the_lambda_function_arn" + + def test_it_finds_memory_limit(self): + subject = AWSLambda.look_up_metadata(self.context, self.env) + assert subject.memory_limit_mb == 2048 + + def test_it_finds_the_execution_env(self): + subject = AWSLambda.look_up_metadata(self.context, self.env) + assert subject.execution_env == "AWS_Lambda_python3.6" + + def test_when_env_is_not_available_it_still_returns_at_least_the_arn(self): + subject = AWSLambda.look_up_metadata(self.context, {}) + assert subject.function_arn == "the_lambda_function_arn" + + def test_when_env_is_not_available_other_values_are_set_to_none(self): + subject = AWSLambda.look_up_metadata(self.context, {}) + assert subject.execution_env is None + assert subject.memory_limit_mb is None + + class TestWhenMemoryEnvIsNotValidInt: + @before + def before(self): + self.context = Mock() + self.context.invoked_function_arn = "the_lambda_function_arn" + self.env = {"AWS_EXECUTION_ENV": "AWS_Lambda_python3.6", + "AWS_LAMBDA_FUNCTION_MEMORY_SIZE": "not_a_valid_integer"} + + def test_it_still_returns_the_arn(self): + subject = AWSLambda.look_up_metadata(self.context, {}) + assert subject.function_arn == "the_lambda_function_arn" + + def test_it_finds_the_execution_env(self): + subject = AWSLambda.look_up_metadata(self.context, self.env) + assert subject.execution_env == "AWS_Lambda_python3.6" + + def test_memory_limit_is_set_to_none(self): + subject = AWSLambda.look_up_metadata(self.context, self.env) + assert subject.memory_limit_mb is None + + class TestWhenContextIsNotAvailable: + def test_it_returns_none(self): + subject = AWSLambda.look_up_metadata(context=None, env={}) + assert subject is None + + class TestSerializeToMap: + def test_it_returns_a_map(self): + subject = AWSLambda(function_arn="the_arn", memory_limit_mb=512, execution_env="AWS_Lambda_python3.8") + + assert subject.serialize_to_map() == { + "computeType": "aws_lambda", + "functionArn": "the_arn", + "memoryLimitInMB": 512, + "executionEnv": "AWS_Lambda_python3.8" + } + + def test_it_ignores_memory_limit_if_none(self): + subject = AWSLambda(function_arn="the_arn", memory_limit_mb=None, execution_env="AWS_Lambda_python3.8") + + assert subject.serialize_to_map() == { + "computeType": "aws_lambda", + "functionArn": "the_arn", + "executionEnv": "AWS_Lambda_python3.8" + } + + def test_it_ignores_execution_env_if_none(self): + subject = AWSLambda(function_arn="the_arn", memory_limit_mb=512, execution_env=None) + + assert subject.serialize_to_map() == { + "computeType": "aws_lambda", + "functionArn": "the_arn", + "memoryLimitInMB": 512 + } + + class 
TestGetMetadataForConfigureAgentCall: + @before + def before(self): + self.context = Mock() + self.context.invoked_function_arn = "the_lambda_function_arn" + self.context.aws_request_id = "the_aws_request_id" + self.context.get_remaining_time_in_millis = Mock(return_value=30125) + self.subject = AWSLambda(function_arn="the_lambda_function_arn", memory_limit_mb=512, + execution_env="AWS_Lambda_python3.8", + agent_id="a509707f-12db-462d-b5c9-18bc19c69bf0") + self.lambda_context = LambdaContext() + self.lambda_context.context = self.context + self.lambda_context.last_execution_duration = timedelta(seconds=1, milliseconds=234, microseconds=987) + + def test_it_returns_the_metadata_needed_for_configure_agent_call(self): + assert self.subject.get_metadata_for_configure_agent_call(lambda_context=self.lambda_context) == { + "ComputePlatform": "AWSLambda", + "AgentId": "a509707f-12db-462d-b5c9-18bc19c69bf0", + "AwsRequestId": "the_aws_request_id", + "ExecutionEnvironment": "AWS_Lambda_python3.8", + "LambdaFunctionArn": "the_lambda_function_arn", + "LambdaMemoryLimitInMB": "512", + "LambdaRemainingTimeInMilliseconds": "30125", + "LambdaPreviousExecutionTimeInMilliseconds": "1234" + } + + class TestWhenSomeFieldsAreMissing: + @pytest.fixture(autouse=True) + def around(self): + self.subject = AWSLambda(function_arn="the_lambda_function_arn", + memory_limit_mb=None, + execution_env=None, + agent_id="a509707f-12db-462d-b5c9-18bc19c69bf0") + LambdaContext.get() + yield + LambdaContext._singleton = None + + def test_it_returns_at_least_the_compute_platform(self): + metadata = self.subject.get_metadata_for_configure_agent_call() + assert metadata["ComputePlatform"] == "AWSLambda" + + def test_it_can_return_last_execution_duration_independently(self): + LambdaContext.get().last_execution_duration = timedelta(seconds=1, milliseconds=234, microseconds=987) + metadata = self.subject.get_metadata_for_configure_agent_call() + assert metadata["LambdaPreviousExecutionTimeInMilliseconds"] == "1234" diff --git a/test/unit/aws_lambda/test_lambda_handler.py b/test/unit/aws_lambda/test_lambda_handler.py new file mode 100644 index 0000000..b925d21 --- /dev/null +++ b/test/unit/aws_lambda/test_lambda_handler.py @@ -0,0 +1,100 @@ +from test.pytestutils import before +import sys +import os +import pytest + + +def handler_function(event, context): + if event == "expected_event" and context == "expected_context": + return "expected result" + return "wrong result" + + +init_handler_has_been_called = False + + +def init_handler(): + global init_handler_has_been_called + init_handler_has_been_called = True + + +class BootstrapModuleMock: + def _get_handler(self, handler): + if handler == "handler_module.handler_function": + return handler_function + + +class BootstrapPython36ModuleMock: + # for python3.6 version of lambda runtime bootstrap + def _get_handlers(self, handler, mode, invokeid): + if handler == "handler_module.handler_function" and mode == "event": + return init_handler, handler_function + + +class TestLambdaHandler: + class TestWhenLambdaHandlerModuleIsLoaded: + @pytest.fixture(autouse=True) + def around(self): + # simulate that we are in a lambda environment where the bootstrap module is available + os.environ['HANDLER_ENV_NAME_FOR_CODEGURU'] = 'handler_module.handler_function' + sys.modules['bootstrap'] = BootstrapModuleMock() + yield + del sys.modules['bootstrap'] + del os.environ['HANDLER_ENV_NAME_FOR_CODEGURU'] + if '_HANDLER' in os.environ: + del os.environ['_HANDLER'] + + def 
test_it_loads_the_handler_function(self): + import codeguru_profiler_agent.aws_lambda.lambda_handler as lambda_handler_module + assert lambda_handler_module.handler_function == handler_function + + def test_call_handler_calls_the_inner_handler(self): + import codeguru_profiler_agent.aws_lambda.lambda_handler as lambda_handler_module + assert lambda_handler_module.call_handler(event="expected_event", + context="expected_context") == "expected result" + + class TestLoadModuleFunction: + class TestWhenPython38LambdaBootstrapCalls: + class TestWhenHandlerEnvIsSetProperly: + @before + def before(self): + self.bootstrap = BootstrapModuleMock() + self.env = {"HANDLER_ENV_NAME_FOR_CODEGURU": "handler_module.handler_function"} + + def test_it_returns_the_handler_function(self): + from codeguru_profiler_agent.aws_lambda.lambda_handler import load_handler + assert load_handler(self.bootstrap, self.env) == handler_function + + def test_it_resets_handler_env_variable(self): + from codeguru_profiler_agent.aws_lambda.lambda_handler import load_handler + load_handler(self.bootstrap, self.env) + assert self.env['_HANDLER'] == "handler_module.handler_function" + + class TestWhenHandlerEnvIsMissing: + @before + def before(self): + self.bootstrap = BootstrapModuleMock() + self.env = {} + + def test_it_throws_value_error(self): + with pytest.raises(ValueError): + from codeguru_profiler_agent.aws_lambda.lambda_handler import load_handler + load_handler(self.bootstrap, self.env) + + class TestWhenPython36LambdaBootstrapCalls: + class TestWhenHandlerEnvIsSetProperly: + @before + def before(self): + self.bootstrap = BootstrapPython36ModuleMock() + self.env = {"HANDLER_ENV_NAME_FOR_CODEGURU": "handler_module.handler_function"} + global init_handler_has_been_called + init_handler_has_been_called = False + + def test_it_returns_the_handler_function(self): + from codeguru_profiler_agent.aws_lambda.lambda_handler import load_handler + assert load_handler(self.bootstrap, self.env) == handler_function + + def test_it_calls_the_init_handler(self): + from codeguru_profiler_agent.aws_lambda.lambda_handler import load_handler + load_handler(self.bootstrap, self.env) + assert init_handler_has_been_called diff --git a/test/unit/aws_lambda/test_profiler_decorator.py b/test/unit/aws_lambda/test_profiler_decorator.py new file mode 100644 index 0000000..32c47a5 --- /dev/null +++ b/test/unit/aws_lambda/test_profiler_decorator.py @@ -0,0 +1,112 @@ +import pytest +import codeguru_profiler_agent.aws_lambda.profiler_decorator + +from mock import MagicMock, patch +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration +from codeguru_profiler_agent import with_lambda_profiler +from codeguru_profiler_agent import Profiler +from codeguru_profiler_agent.aws_lambda.lambda_context import LambdaContext + + +class TestWithLambdaProfiler: + @pytest.fixture(autouse=True) + def around(self): + self.mock_profiler = MagicMock(name="profiler", spec=Profiler) + self.context = MagicMock() + self.context.invoked_function_arn = "the_lambda_function_arn" + + @with_lambda_profiler(profiler_factory=lambda *args, **kwargs: self.mock_profiler) + def handler_function(event, context): + self.counter += 1 + if event.get('throw'): + raise ValueError( + "Simulated error in a lambda handler (unit test)") + + self.counter = 0 + self.handler = handler_function + self.mock_profiler.start.return_value = True + + yield \ + codeguru_profiler_agent.aws_lambda.profiler_decorator.clear_static_profiler() + if Profiler._active_profiler is not 
None: + Profiler._active_profiler.stop() + + def test_function_is_called(self): + self.handler({}, self.context) + assert (self.counter == 1) + + def test_profiler_is_started_and_paused(self): + self.handler({}, self.context) + self.mock_profiler.start.assert_called_once() + self.mock_profiler.pause.assert_called_once() + + def test_profiler_is_paused_when_handler_fails(self): + try: + self.handler({'throw': True}, self.context) + except ValueError: + pass + self.mock_profiler.pause.assert_called_once() + + def test_profiler_is_disabled_if_start_fails(self): + self.mock_profiler.start.return_value = False + self.handler({}, self.context) + # now call again and check that profiler is disabled + self.handler({}, self.context) + assert isinstance(codeguru_profiler_agent.aws_lambda.profiler_decorator._profiler, + type(codeguru_profiler_agent.aws_lambda.profiler_decorator._EmptyProfiler())) + + def test_function_runs_even_when_profiler_is_disabled(self): + self.mock_profiler.start.return_value = False + self.handler({}, self.context) + # make sure function is still called + assert (self.counter == 1) + # now call again, function is still called. + self.handler({}, self.context) + assert (self.counter == 2) + + +class TestWithParameters: + @pytest.fixture(autouse=True) + def around(self): + self.context = MagicMock() + self.context.invoked_function_arn = "the_lambda_function_arn" + self.env = {"AWS_LAMBDA_FUNCTION_MEMORY_SIZE": "1024", + "AWS_EXECUTION_ENV": "AWS_Lambda_python3.6"} + + # define a handler function with the profiler decorator and parameters + @with_lambda_profiler(profiling_group_name="pg_name", region_name="eu-north-1", + environment_override={'cpu_limit_percentage': 42}, env=self.env) + def handler_function(event, context): + return True + + self.handler = handler_function + yield \ + codeguru_profiler_agent.aws_lambda.profiler_decorator.clear_static_profiler() + + def test_given_profiling_group_is_used(self): + self.handler({}, self.context) + assert (codeguru_profiler_agent.aws_lambda.profiler_decorator._profiler._profiler_runner.collector + .profiling_group_name == "pg_name") + + def test_given_region_name_is_used(self): + self.handler({}, self.context) + assert ('eu-north-1' in codeguru_profiler_agent.aws_lambda.profiler_decorator._profiler._profiler_runner. + collector.reporter.codeguru_client_builder.codeguru_client._endpoint.host) + + def test_given_override_is_used(self): + self.handler({}, self.context) + assert AgentConfiguration.get().cpu_limit_percentage == 42 + + def test_metadata_is_properly_set(self): + self.handler({}, self.context) + fleet_info = codeguru_profiler_agent.aws_lambda.profiler_decorator._profiler. 
\ + _profiler_runner.collector.reporter.metadata.fleet_info + assert (fleet_info.get_fleet_instance_id() == "the_lambda_function_arn") + + def test_context_is_set_in_lambda_context_singleton(self): + self.handler({}, self.context) + assert LambdaContext.get().context is self.context + + def test_last_call_duration_is_set_in_lambda_context_singleton(self): + self.handler({}, self.context) + assert LambdaContext.get().last_execution_duration diff --git a/test/unit/file_reporter/test_file_reporter.py b/test/unit/file_reporter/test_file_reporter.py new file mode 100644 index 0000000..a1266b2 --- /dev/null +++ b/test/unit/file_reporter/test_file_reporter.py @@ -0,0 +1,57 @@ +import tempfile +import pytest +import shutil + +from mock import MagicMock +from mock import ANY +from pathlib import Path + +from codeguru_profiler_agent.file_reporter.file_reporter import FileReporter +from codeguru_profiler_agent.sdk_reporter.profile_encoder import ProfileEncoder +from test.help_utils import FILE_PREFIX + +class TestFileReporter: + class TestReport: + @pytest.fixture(autouse=True) + def around(self): + temporary_directory = tempfile.mkdtemp() + self.file_prefix = str( + Path(temporary_directory, FILE_PREFIX)) + + self.profile_encoder = MagicMock( + name="profile_encoder", spec=ProfileEncoder) + self.profile = MagicMock(name="profile") + + self.subject = FileReporter( + environment={ + "file_prefix": self.file_prefix, + "profile_encoder": self.profile_encoder + }) + + yield + + shutil.rmtree(temporary_directory) + + def report(self): + return self.subject.report(profile=self.profile) + + def test_it_uses_the_file_prefix_to_create_a_file_name_for_the_profile( + self): + assert (self.report().startswith(self.file_prefix)) + + def test_it_adds_the_json_file_extension(self): + assert (self.report().endswith(".json")) + + def test_it_calls_the_profile_encoder_with_the_profile(self): + self.report() + + self.profile_encoder.encode.assert_called_once_with( + profile=self.profile, output_stream=ANY) + + def test_it_writes_the_encoded_profile_to_a_file(self): + self.profile_encoder.encode.side_effect = lambda **args: args["output_stream"].write(b"output-from-encoder") + + output_file = self.report() + + with open(output_file) as output_file_content: + assert (output_file_content.read() == "output-from-encoder") diff --git a/test/unit/metrics/test_metric.py b/test/unit/metrics/test_metric.py new file mode 100644 index 0000000..78ee2ad --- /dev/null +++ b/test/unit/metrics/test_metric.py @@ -0,0 +1,40 @@ +from test.pytestutils import before + +from codeguru_profiler_agent.metrics.metric import Metric + + +class TestAdd: + @before + def before(self): + self.subject = Metric() + self.subject.add(10) + + def test_value_less_than_max(self): + self.subject.add(5) + + assert (self.subject.counter == 2) + assert (self.subject.max == 10) + assert (self.subject.total == 15) + + def test_value_greater_than_max(self): + self.subject.add(15) + + assert (self.subject.counter == 2) + assert (self.subject.max == 15) + assert (self.subject.total == 25) + + +class TestAverage: + @before + def before(self): + self.subject = Metric() + + def test_get_average_when_counter_is_zero(self): + assert (self.subject.average() == 0) + + def test_get_average_when_counter_is_not_zero(self): + self.subject.add(5) + self.subject.add(10) + self.subject.add(15) + + assert (self.subject.average() == 10) diff --git a/test/unit/metrics/test_timer.py b/test/unit/metrics/test_timer.py new file mode 100644 index 0000000..3a82f94 --- /dev/null +++ 
diff --git a/test/unit/metrics/test_timer.py new file mode 100644 index 0000000..3a82f94 --- /dev/null +++ b/test/unit/metrics/test_timer.py @@ -0,0 +1,32 @@ +from test.pytestutils import before + +from codeguru_profiler_agent.metrics.timer import Timer + + +class TestTimer: + class TestRecord: + @before + def before(self): + self.subject = Timer() + self.subject.record("test-record", 10) + + def test_new_record_comes_in(self): + self.subject.record("new-record", 12) + + assert (self.subject.metrics["new-record"].total == 12) + assert (self.subject.metrics["new-record"].counter == 1) + + def test_update_old_record(self): + self.subject.record("test-record", 20) + + assert (self.subject.metrics["test-record"].total == 30) + assert (self.subject.metrics["test-record"].counter == 2) + + class TestReset: + def test_metrics_get_reset(self): + subject = Timer() + subject.record("test-record", 10) + + subject.reset() + + assert ("test-record" not in subject.metrics) diff --git a/test/unit/metrics/test_with_timer.py new file mode 100644 index 0000000..29c830e --- /dev/null +++ b/test/unit/metrics/test_with_timer.py @@ -0,0 +1,41 @@ +from test.pytestutils import before + +from codeguru_profiler_agent.metrics.with_timer import with_timer +from codeguru_profiler_agent.metrics.timer import Timer + + +class TargetClass: + def __init__(self): + self.timer = Timer() + + @with_timer(metric_name="test-foo-wall", measurement="wall-clock-time") + def foo_wall(self): + return + + @with_timer(metric_name="test-foo-cpu", measurement="cpu-time") + def foo_cpu(self): + # Run something to make sure the cpu clock does tick (https://bugs.python.org/issue37859) + len(str(2 ** 500_000)) + return + + +class TestWithTimer: + @before + def before(self): + self.test_class = TargetClass() + + def test_it_times_wall_time(self): + self.test_class.foo_wall() + + assert (self.test_class.timer.metrics["test-foo-wall"].counter == 1) + assert (self.test_class.timer.metrics["test-foo-wall"].max > 0) + assert (self.test_class.timer.metrics["test-foo-wall"].total == + self.test_class.timer.metrics["test-foo-wall"].max) + + def test_it_times_cpu_time(self): + self.test_class.foo_cpu() + + assert (self.test_class.timer.metrics["test-foo-cpu"].counter == 1) + assert (self.test_class.timer.metrics["test-foo-cpu"].max > 0) + assert (self.test_class.timer.metrics["test-foo-cpu"].total == + self.test_class.timer.metrics["test-foo-cpu"].max)
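test_with_timer.py exercises a decorator that records how long a method takes under a named metric, on either the wall clock or the CPU clock. Below is a minimal sketch of such a decorator; the `metric_name`/`measurement` signature is taken from the tests, the body is an assumption, and it presumes the decorated object carries a `timer` with a `record(name, value)` method:

```python
# Illustrative sketch of a with_timer-style decorator.
import time
import functools

def with_timer(metric_name, measurement="wall-clock-time"):
    # pick the clock once, at decoration time
    clock = time.perf_counter if measurement == "wall-clock-time" else time.process_time
    def decorator(method):
        @functools.wraps(method)
        def wrapper(self, *args, **kwargs):
            start = clock()
            try:
                return method(self, *args, **kwargs)
            finally:
                # record elapsed time under the given metric name
                self.timer.record(metric_name, clock() - start)
        return wrapper
    return decorator
```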
diff --git a/test/unit/model/test_call_graph_node.py new file mode 100644 index 0000000..bcc18a6 --- /dev/null +++ b/test/unit/model/test_call_graph_node.py @@ -0,0 +1,209 @@ +import pytest + +from codeguru_profiler_agent.model.frame import Frame +from test.pytestutils import before +from mock import MagicMock + +from codeguru_profiler_agent.model.call_graph_node import CallGraphNode +from codeguru_profiler_agent.model.memory_counter import MemoryCounter + + +class TestCallGraphNodeInit: + def test_memory_count_called(self): + mock_memory_counter = _mock_memory_counter() + CallGraphNode("foo", class_name=None, file_path=None, line_no=None, memory_counter=mock_memory_counter) + + mock_memory_counter.count_create_node.assert_called_once() + mock_memory_counter.count_first_child.assert_not_called() + mock_memory_counter.count_add_child.assert_not_called() + + +class TestCallGraphNode: + def before(self): + self.subject = CallGraphNode("dummy_frame", None, file_path="file_path/file.py", line_no=123) + + +class TestUpdateCurrentNodeAndGetChild(TestCallGraphNode): + @before + def before(self): + super().before() + + def test_when_child_node_with_frame_does_not_exist_it_adds_a_new_child_node(self): + new_child_node = self.subject.update_current_node_and_get_child(Frame("new_child_frame")) + + assert (new_child_node in self.subject.children) + + def test_when_child_node_with_frame_does_not_exist_it_returns_a_new_child_node_for_the_given_frame( + self): + new_child_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", class_name="TestClass", file_path="file_path/file.py", line_no=123)) + + assert (new_child_node.frame_name == "new_child_frame") + assert (new_child_node.class_name == "TestClass") + assert (new_child_node.file_path == "file_path/file.py") + assert (new_child_node.start_line == 123) + assert (new_child_node.end_line == 123) + assert (new_child_node.runnable_count == 0) + + def test_when_class_does_not_exist_it_returns_a_new_child_node_with_None_class_name(self): + new_child_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", class_name=None)) + + assert (new_child_node.class_name is None) + + def test_when_file_path_does_not_exist_it_returns_a_new_child_node_with_None_file_path(self): + new_child_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path=None)) + + assert (new_child_node.file_path is None) + + def test_when_line_no_does_not_exist_it_returns_a_new_child_node_with_None_line_no(self): + new_child_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", line_no=None)) + + assert (new_child_node.start_line is None) + assert (new_child_node.end_line is None) + + def test_when_class_name_is_different_it_returns_a_new_child_node(self): + existing_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", class_name="TestClassA")) + new_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", class_name="TestClassB")) + + assert (existing_node is not new_node) + + def test_when_file_path_is_different_it_returns_a_new_child_node(self): + existing_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file1.py")) + new_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file2.py")) + + assert (existing_node is not new_node) + + def test_when_frame_name_is_different_it_returns_a_new_child_node(self): + existing_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame_1")) + new_node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame_2")) + + assert (existing_node is not new_node) + + def test_when_child_node_with_frame_already_existed_it_does_not_add_a_new_one(self): + self.subject.update_current_node_and_get_child(Frame("new_child_frame")) + + self.subject.update_current_node_and_get_child(Frame("new_child_frame")) + + assert (len(self.subject.children) == 1) + + def test_when_child_node_with_frame_already_existed_it_returns_the_existing_child_node( + self): + new_child_node = self.subject.update_current_node_and_get_child(Frame("new_child_frame")) + + assert (self.subject.update_current_node_and_get_child(Frame("new_child_frame")) is new_child_node) + + def test_when_line_no_needs_to_be_updated_it_takes_the_smallest_start_line_no(self): + self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", line_no=123)) + + node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", 
line_no=100)) + + assert (node.start_line == 100) + assert (node.end_line == 123) + + def test_when_line_no_needs_to_be_updated_it_takes_the_largest_end_line_no(self): + self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", line_no=200)) + + node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", line_no=234)) + + assert (node.start_line == 200) + assert (node.end_line == 234) + + def test_when_no_lines_existed_before_it_updates_line_no_range(self): + self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", line_no=None)) + + node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", line_no=200)) + + assert (node.start_line == 200) + assert (node.end_line == 200) + + def test_when_line_no_not_available_it_does_not_update_line_no(self): + self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", line_no=None)) + + node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", line_no=None)) + + assert (node.start_line is None) + assert (node.end_line is None) + + def test_when_line_no_existed_before_it_does_not_update_line_no(self): + self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", line_no=100)) + + node = self.subject.update_current_node_and_get_child( + Frame("new_child_frame", file_path="file_path/file.py", line_no=None)) + + assert (node.start_line == 100) + assert (node.end_line == 100) + + +class TestInteractionWithMemoryCounter: + def test_insert_first_child_calls_memory_counter(self): + mock_memory_counter = _mock_memory_counter() + call_graph_node = CallGraphNode("foo", class_name=None, file_path=None, line_no=None, + memory_counter=mock_memory_counter) + + call_graph_node.update_current_node_and_get_child(Frame("new_child_frame")) + + mock_memory_counter.count_first_child.assert_called_once() + mock_memory_counter.count_add_child.assert_not_called() + + def test_insert_already_existing_child_does_not_call_memory_counter(self): + mock_memory_counter = _mock_memory_counter() + call_graph_node = CallGraphNode("foo", class_name=None, file_path=None, line_no=None, + memory_counter=mock_memory_counter) + call_graph_node.update_current_node_and_get_child(Frame("new_child_frame")) + mock_memory_counter.reset_mock() + + call_graph_node.update_current_node_and_get_child(Frame("new_child_frame")) + + mock_memory_counter.assert_not_called() + + def test_insert_extra_child_calls_memory_counter(self): + mock_memory_counter = _mock_memory_counter() + call_graph_node = CallGraphNode("foo", class_name=None, file_path=None, line_no=None, + memory_counter=mock_memory_counter) + call_graph_node.update_current_node_and_get_child(Frame("new_child_frame_1")) + mock_memory_counter.reset_mock() + + call_graph_node.update_current_node_and_get_child(Frame("new_child_frame_2")) + + mock_memory_counter.count_first_child.assert_not_called() + mock_memory_counter.count_add_child.assert_called_once() + + +class TestIncreaseRunnableCount: + def test_it_increases_the_runnable_count_by_one(self): + subject = CallGraphNode("dummy_frame", class_name=None, file_path=None, line_no=None) + subject.increase_runnable_count() + + assert (subject.runnable_count == 1) + + def 
test_when_a_custom_value_to_add_is_used_it_increases_the_runnable_count_by_the_value_to_add(self): + subject = CallGraphNode("dummy_frame", class_name=None, file_path=None, line_no=None) + subject.increase_runnable_count(value_to_add=2) + + assert (subject.runnable_count == 2) + + def test_when_a_negative_value_to_add_is_used_it_raises_a_value_error(self): + subject = CallGraphNode("dummy_frame", class_name=None, file_path=None, line_no=None) + with pytest.raises(ValueError): + subject.increase_runnable_count(value_to_add=-1) + + +def _mock_memory_counter(): + return MagicMock(name="memory_counter", spec=MemoryCounter) diff --git a/test/unit/model/test_memory_counter.py new file mode 100644 index 0000000..c842c6c --- /dev/null +++ b/test/unit/model/test_memory_counter.py @@ -0,0 +1,73 @@ +import sys + +from pympler import asizeof + +from codeguru_profiler_agent.model.call_graph_node import CallGraphNode +from codeguru_profiler_agent.model.frame import Frame +from codeguru_profiler_agent.model.memory_counter import MemoryCounter + + +# These tests validate that our simple memory tracker's math matches with what we can observe in the Pympler memory +# sizing library 
class TestMemoryCounter: + class TestEmptyNodeSize: + def test_it_matches_the_expected_size(self): + dummy_frame_name = None + + node = CallGraphNode(frame_name=dummy_frame_name, class_name=None, file_path=None, line_no=None) + node.increase_runnable_count() + + full_recursive_size_of_node = asizeof.asizeof(node) + + frame_name_size = sys.getsizeof(dummy_frame_name) + empty_children_tuple_size = sys.getsizeof(()) + + assert (MemoryCounter.empty_node_size_bytes == \ + (full_recursive_size_of_node \ + # The empty size should not include the frame name, so we subtract it + - frame_name_size + # The empty tuple is always reused by Python, so we also subtract it + - empty_children_tuple_size)) + + def test_sanity_check_it_is_smaller_than_256_bytes(self): + assert (MemoryCounter.empty_node_size_bytes <= 256) + + class TestBaseStorageSize: + def test_it_matches_the_expected_size(self): + node = CallGraphNode(frame_name=None, class_name=None, file_path=None, line_no=None) + node.update_current_node_and_get_child(frame=Frame(None)) + + recursive_node_storage_size = asizeof.asizeof(node.children) + child_node_size = asizeof.asizeof( + CallGraphNode(frame_name=None, class_name=None, file_path=None, line_no=None)) + + assert (MemoryCounter.base_storage_size_bytes == ( + recursive_node_storage_size - child_node_size)) + + def test_sanity_check_it_is_smaller_than_100_bytes(self): + assert (MemoryCounter.base_storage_size_bytes <= 100) + + class TestStorageIncrementSize: + def test_it_matches_the_expected_size(self): + node = CallGraphNode(frame_name=None, class_name=None, file_path=None, line_no=None) + + node.update_current_node_and_get_child(Frame("child1")) + one_child_storage_size = asizeof.asizeof(node.children, limit=0) + + node.update_current_node_and_get_child(Frame("child2")) + two_children_storage_size = asizeof.asizeof(node.children, limit=0) + + assert (MemoryCounter.storage_increment_size_bytes == ( + two_children_storage_size - one_child_storage_size)) + + def test_sanity_check_it_is_smaller_than_16_bytes(self): + assert (MemoryCounter.storage_increment_size_bytes <= 16) + + class TestCountCreateNode: + def test_sanity_check_it_counts_frame_file_path_line_no_class_name_size(self): + subject = MemoryCounter() + subject.count_create_node(frame="test/frame", file_path="test/file/path", 
class_name="TestClass") + # [Oct-2020 Python-3.7.7] "test/frame" size: 59 bytes; "test/file/path" size: 63 bytes; "TestClass" size: + # 58 bytes; fixed line_no size: 2 * 32 = 64; sum = 244 + expected_size = subject.empty_node_size_bytes + 244 + assert (subject.get_memory_usage_bytes() == expected_size) diff --git a/test/unit/model/test_profile.py b/test/unit/model/test_profile.py new file mode 100644 index 0000000..f197f4d --- /dev/null +++ b/test/unit/model/test_profile.py @@ -0,0 +1,488 @@ +import pytest +from mock import Mock + +from codeguru_profiler_agent.model.frame import Frame +from test.pytestutils import before +import datetime + +from codeguru_profiler_agent.model.profile import Profile +from codeguru_profiler_agent.model.sample import Sample + + +class TestProfile: + def before(self): + self.test_start_time = 1603884061556 + self.mock_clock = Mock() + self.mock_clock.return_value = self.test_start_time / 1000 + self.subject = Profile( + profiling_group_name="foo", + sampling_interval_seconds=1.0, + host_weight=2, + start=self.test_start_time, + clock=self.mock_clock + ) + + def turn_clock(seconds): + self.mock_clock.return_value += seconds + + self.turn_clock = turn_clock + + +class TestAdd(TestProfile): + @before + def before(self): + super().before() + + @pytest.mark.parametrize("stacks, expected", [ + ([[Frame("method_one"), Frame("method_two"), Frame("method_three")]], { + "count": 0, + "children": { + "method_one": { + "count": 0, + "children": { + "method_two": { + "count": 0, + "children": { + "method_three": { + "count": 1, + "children": {} + } + } + } + } + } + } + }), ([[Frame("method_one"), Frame("method_two"), Frame("method_three"), Frame("method_five")], + [Frame("method_one"), Frame("method_two"), Frame("method_four"), Frame("method_five")], + [Frame("method_one"), Frame("method_two"), Frame("method_three"), Frame("method_six")], + [Frame("method_one"), Frame("method_seven"), Frame("method_three")]], { + "count": 0, + "children": { + "method_one": { + "count": 0, + "children": { + "method_two": { + "count": 0, + "children": { + "method_three": { + "count": 0, + "children": { + "method_five": { + "count": 1, + "children": {} + }, + "method_six": { + "count": 1, + "children": {} + } + } + }, + "method_four": { + "count": 0, + "children": { + "method_five": { + "count": 1, + "children": {} + } + } + } + } + }, + "method_seven": { + "count": 0, + "children": { + "method_three": { + "count": 1, + "children": {} + } + } + } + } + } + } + }), + ([[Frame("method_one"), Frame("method_two"), Frame("method_three"), Frame("method_four")], + [Frame("method_one"), Frame("method_two"), Frame("method_three"), Frame("method_four")]], { + "count": 0, + "children": { + "method_one": { + "count": 0, + "children": { + "method_two": { + "count": 0, + "children": { + "method_three": { + "count": 0, + "children": { + "method_four": { + "count": 2, + "children": {} + } + } + } + } + } + } + } + } + }), + ([[Frame("method_one"), Frame("method_two"), Frame("method_three")], + [Frame("method_one"), Frame("method_two")]], { + "count": 0, + "children": { + "method_one": { + "count": 0, + "children": { + "method_two": { + "count": 1, + "children": { + "method_three": { + "count": 1, + "children": {} + } + } + } + } + } + } + }), + ([[Frame("method_one", file_path="path/file1.py"), Frame("method_two", file_path="path/file2.py"), + Frame("method_three", file_path="path/file3.py")], + [Frame("method_one", file_path="path/file1.py"), Frame("method_two", file_path="path/file2.py")]], { + "count": 0, 
+ "children": { + "method_one": { + "count": 0, + "children": { + "method_two": { + "count": 1, + "children": { + "method_three": { + "count": 1, + "children": {}, + "file": "path/file3.py" + } + }, + "file": "path/file2.py" + } + }, + "file": "path/file1.py" + } + } + }), + # The following test case is for testing that line_no is correctly written to the profile; it covers + # when line_no points to a specific line or to a range of lines represented in a list + ([[Frame("method_one", file_path="path/file1.py", line_no=1), + Frame("method_two", file_path="path/file2.py", line_no=10), + Frame("method_three", file_path="path/file3.py", line_no=100)], + [Frame("method_one", file_path="path/file1.py", line_no=1), + Frame("method_two", file_path="path/file2.py", line_no=20), + Frame("method_three", file_path="path/file3.py", line_no=90)]], { + "count": 0, + "children": { + "method_one": { + "count": 0, + "children": { + "method_two": { + "count": 0, + "children": { + "method_three": { + "count": 2, + "children": {}, + "file": "path/file3.py", + "line": [90, 100] + } + }, + "file": "path/file2.py", + "line": [10, 20] + } + }, + "file": "path/file1.py", + "line": [1] + } + } + }), + # The following test case is for testing that class_name is correctly written to the profile; it covers + # the cases when frame name/ file_path are the same when the class_name is different; frame_name/ file_path + # are different when the class_name is the same + ([[Frame("method_one", file_path="path/file1.py", class_name="ClassA"), + Frame("method_two", file_path="path/file2.py", class_name="ClassB")], + [Frame("method_one", file_path="path/file1.py", class_name="ClassA"), + Frame("method_two", file_path="path/file2.py", class_name="AnotherClassB")], + [Frame("another_method_one", file_path="path/file1.py", class_name="ClassA")], + [Frame("method_one", file_path="path/another_file1.py", class_name="ClassA")]], { + "count": 0, + "children": { + "method_one": { + "count": 0, + "children": { + "method_two": { + "count": 1, + "children": {}, + "file": "path/file2.py", + "class_name": "ClassB" + }, + "method_two": { + "count": 1, + "children": {}, + "file": "path/file2.py", + "class_name": "AnotherClassB" + } + }, + "file": "path/file1.py", + "class_name": "ClassA" + }, + "another_method_one": { + "count": 1, + "children": {}, + "file": "path/file1.py", + "class_name": "ClassA" + }, + "method_one": { + "count": 1, + "children": {}, + "file": "path/another_file1.py", + "class_name": "ClassA" + } + } + }) + ]) + def test_add_stack(self, stacks, expected): + sample = Sample(stacks=stacks) + + self.subject.add(sample) + + assert (_convert_profile_into_dict(self.subject) == expected) + + def test_it_keeps_the_total_sum_of_the_attempted_sample_threads_count_values( + self): + sample1 = Sample(stacks=[[Frame("frame1")]], attempted_sample_threads_count=12) + sample2 = Sample(stacks=[[Frame("frame1")]], attempted_sample_threads_count=34) + + self.subject.add(sample1) + self.subject.add(sample2) + + assert (self.subject.total_attempted_sample_threads_count == ( + 12 + 34)) + + def test_it_keeps_the_total_sum_of_the_seen_threads_count_values(self): + sample1 = Sample(stacks=[[Frame("frame1")]], seen_threads_count=56) + sample2 = Sample(stacks=[[Frame("frame1")]], seen_threads_count=78) + + self.subject.add(sample1) + self.subject.add(sample2) + + assert (self.subject.total_seen_threads_count == (56 + 78)) + + +class TestStartTime(TestProfile): + @before + def before(self): + super().before() + + def 
test_it_stores_valid_start_time(self): + assert self.subject.start == self.test_start_time + + def test_it_raises_exception_for_invalid_start_time(self): + with pytest.raises(ValueError): + Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=0) + with pytest.raises(ValueError): + Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=-100) + with pytest.raises(TypeError): + Profile(profiling_group_name="foo", sampling_interval_seconds=1.0, host_weight=2, start=None) + + +class TestEndTime(TestProfile): + @before + def before(self): + super().before() + + def test_it_sets_valid_end_time(self): + test_end_time = self.test_start_time + 1 + self.subject.end = test_end_time + assert self.subject.end == test_end_time + + def test_it_raises_when_setting_invalid_end_time(self): + test_end_time = self.test_start_time - 1 + with pytest.raises(ValueError): + self.subject.end = test_end_time + with pytest.raises(TypeError): + self.subject.end = None + + +class TestPause(TestProfile): + @before + def before(self): + super().before() + + def test_it_sets_last_resume_to_none(self): + assert self.subject.last_resume is not None + self.subject.pause() + assert self.subject.last_resume is None + assert self.subject.last_pause is not None + + def test_when_last_pause_is_not_none_it_returns(self): + self.subject.last_pause = self.test_start_time + self.subject.pause() + + def test_when_last_pause_is_not_none_it_does_not_update_last_pause(self): + assert self.subject.last_pause is None + self.subject.last_pause = self.test_start_time + self.turn_clock(seconds=1) + self.subject.pause() + assert self.subject.last_pause == self.test_start_time + + +class TestResume(TestProfile): + @before + def before(self): + super().before() + + def test_it_sets_last_resume(self): + self.turn_clock(seconds=1) + self.subject.pause() + assert self.subject.last_resume is None + self.turn_clock(seconds=1) + self.subject.resume() + assert self.subject.last_resume == self.test_start_time + 2000 + assert self.subject.last_pause is None + + +class TestWhenLastResumeIsNotNone(TestProfile): + @before + def before(self): + super().before() + + def test_it_returns(self): + self.subject.resume() + + def test_it_does_not_update_last_resume(self): + assert self.subject.last_resume == self.test_start_time + self.subject.resume() + assert self.subject.last_resume == self.test_start_time + + +class TestGetActiveMillisSinceStart(TestProfile): + @before + def before(self): + super().before() + + def test_it_returns_duration_since_start(self): + self.turn_clock(seconds=4) + assert self.subject.get_active_millis_since_start() == 4000 + + +class TestWhenStillPaused(TestProfile): + @before + def before(self): + super().before() + self.turn_clock(seconds=9) + self.subject.pause() + self.turn_clock(seconds=3) + + def test_it_returns_duration_up_to_last_pause_time(self): + assert self.subject.get_active_millis_since_start() == 9000 + + +class TestWhenPauseAndResumeWereCalled(TestProfile): + @before + def before(self): + super().before() + self.turn_clock(seconds=9) + self.subject.pause() + self.turn_clock(seconds=3) + self.subject.resume() + self.turn_clock(seconds=1) + + def test_it_returns_duration_minus_paused_time(self): + assert self.subject.get_active_millis_since_start() == 10000 + + +class TestWhenEndHasBeenSet(TestProfile): + @before + def before(self): + super().before() + self.turn_clock(seconds=9) + self.subject.pause() + self.turn_clock(seconds=3) + self.subject.resume() + 
self.turn_clock(seconds=5) + self.subject.end = self.mock_clock() * 1000 + self.turn_clock(seconds=1) # that extra second should not be included in the duration + + def test_it_returns_duration_up_to_end_time(self): + assert self.subject.get_active_millis_since_start() == 14000 + + +class TestSetOverhead(TestProfile): + @before + def before(self): + super().before() + + def test_it_overrides_the_overhead_duration(self): + self.subject.set_overhead_ms(duration_timedelta=datetime.timedelta(seconds=5.0)) + self.subject.set_overhead_ms(duration_timedelta=datetime.timedelta(seconds=37.0)) + assert self.subject.overhead_ms == 37000 + + +class TestInit(TestProfile): + @before + def before(self): + super().before() + + def test_root_node_frame_is_ALL(self): + assert (self.subject.callgraph.frame_name == "ALL") + + def test_set_metadata_correctly(self): + assert (self.subject.sampling_interval_ms == 1000) + assert (isinstance(self.subject.sampling_interval_ms, int)) + assert (self.subject.host_weight == 2) + assert (isinstance(self.subject.host_weight, int)) + + def test_it_initializes_the_total_attempted_sample_threads_count_to_zero( + self): + assert (self.subject.total_attempted_sample_threads_count == 0) + + def test_it_initializes_the_total_seen_threads_count_to_zero(self): + assert (self.subject.total_seen_threads_count == 0) + + def test_when_profile_is_empty_it_returns_1(self): + assert (self.subject.average_thread_weight() == 1.0) + + +class TestGetAverageThreadWeight(TestProfile): + @before + def before(self): + super().before() + + def test_it_returns_the_average_thread_weight_for_the_samples_in_the_profile( + self): + sample = Sample(stacks=[[Frame("frame1")]], attempted_sample_threads_count=10, seen_threads_count=15) + + self.subject.add(sample) + + assert (self.subject.average_thread_weight() == 1.5) + + +def _convert_profile_into_dict(profile): + return _convert_node_into_dict(profile.callgraph) + + +def _convert_node_into_dict(node): + node_in_dict = { + "count": node.runnable_count, + "children": + {node.frame_name: _convert_node_into_dict(node) for node in node.children} + } + + if node.file_path is not None: + node_in_dict["file"] = node.file_path + if node.class_name is not None: + node_in_dict["class_name"] = node.class_name + if node.start_line is not None: + if node.start_line == node.end_line: + node_in_dict["line"] = [node.start_line] + else: + node_in_dict["line"] = [node.start_line, node.end_line] + + return node_in_dict
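The pause/resume tests above encode one piece of arithmetic worth spelling out: active duration is wall time minus accumulated paused time, frozen at the last pause while paused and at `end` once it is set. A hypothetical stand-alone version of that computation (all names here are invented for illustration; the real Profile keeps equivalent state internally):

```python
# Sketch of the active-duration bookkeeping the tests above describe.
def active_millis(start_ms, now_ms, paused_ms, last_pause_ms=None, end_ms=None):
    """Milliseconds of active profiling between start and now (or end)."""
    if last_pause_ms is not None:       # still paused: stop counting at the pause
        return last_pause_ms - start_ms - paused_ms
    effective_now = end_ms if end_ms is not None else now_ms
    return effective_now - start_ms - paused_ms

# e.g. paused between t=9s and t=12s, asked at t=13s: 10 active seconds
assert active_millis(0, 13000, paused_ms=3000) == 10000
```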
diff --git a/test/unit/reporter/test_agent_configuration.py new file mode 100644 index 0000000..eafdc1a --- /dev/null +++ b/test/unit/reporter/test_agent_configuration.py @@ -0,0 +1,103 @@ +import pytest +from datetime import timedelta +from test.pytestutils import before +import codeguru_profiler_agent.reporter.agent_configuration +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration + + +class TestAgentConfiguration: + + def before(self): + self.agent_config = AgentConfiguration( + should_profile=True, + sampling_interval=timedelta(milliseconds=9100), + minimum_time_reporting=timedelta(seconds=7), + reporting_interval=timedelta(minutes=7), + max_stack_depth=999, + cpu_limit_percentage=9 + ) + AgentConfiguration.set(self.agent_config) + + +class TestInit(TestAgentConfiguration): + @before + def before(self): + super().before() + + def test_it_sets_values_from_constructor(self): + assert self.agent_config.should_profile is True + assert self.agent_config.sampling_interval == timedelta(milliseconds=9100) + assert self.agent_config.reporting_interval == timedelta(minutes=7) + assert self.agent_config.minimum_time_reporting == timedelta(seconds=7) + assert self.agent_config.max_stack_depth == 999 + assert self.agent_config.cpu_limit_percentage == 9 + + def test_when_customer_reporting_interval_is_less_than_30_seconds_it_raises_a_value_error(self): + with pytest.raises(ValueError): + AgentConfiguration(reporting_interval=timedelta(seconds=28), + minimum_time_reporting=timedelta(seconds=29)) + + def test_get_raises_error_when_singleton_is_none(self): + setattr(codeguru_profiler_agent.reporter.agent_configuration, "_singleton", None) + with pytest.raises(ValueError): + AgentConfiguration.get() + + def test_set_raises_error_when_setting_a_none_instance(self): + with pytest.raises(ValueError): + AgentConfiguration.set(None) + + def test_it_sets_the_values_after_updating_using_new_configuration(self): + new_config = self.agent_config._get_new_config(configure_agent_response={ + "agentParameters": { + "SamplingIntervalInMilliseconds": "2100", + "MinimumTimeForReportingInMilliseconds": "60000", + "MaxStackDepth": "1001", + "MemoryUsageLimitPercent": "10" + }, + "periodInSeconds": 123, + "shouldProfile": False + }) + assert new_config.should_profile is False + assert new_config.sampling_interval == timedelta(seconds=2.1) + assert new_config.minimum_time_reporting == timedelta(milliseconds=60000) + assert new_config.reporting_interval == timedelta(seconds=123) + assert new_config.max_stack_depth == 1001 + assert new_config.cpu_limit_percentage == 10 + + +class TestGetValuesFromResponse(TestAgentConfiguration): + @before + def before(self): + super().before() + self.response = {"one": 1, "two": "2", "three": "not a number"} + self.default_value = timedelta(seconds=999) + + def test_get_int_value_from(self): + assert AgentConfiguration._get_int_value_from("one", self.response) == 1 + assert AgentConfiguration._get_int_value_from("two", self.response) == 2 + assert AgentConfiguration._get_int_value_from("three", self.response) is None + assert AgentConfiguration._get_int_value_from("three", self.response, 0) == 0 + assert AgentConfiguration._get_int_value_from("four", self.response) is None + assert AgentConfiguration._get_int_value_from("four", self.response, 999) == 999 + + def test_get_interval_from(self): + assert AgentConfiguration._get_interval_from("one", self.response, self.default_value, + in_seconds=True) == timedelta(seconds=1) + assert AgentConfiguration._get_interval_from("one", self.response, self.default_value, + in_milliseconds=True) == timedelta(milliseconds=1) + assert AgentConfiguration._get_interval_from("three", self.response, self.default_value, + in_milliseconds=True) == self.default_value + + def test_get_interval_from_raises_error_if_both_true(self): + with pytest.raises(ValueError): + AgentConfiguration._get_interval_from("one", self.response, self.default_value, + in_seconds=True, in_milliseconds=True) + + def test_get_interval_from_raises_error_if_both_false(self): + with pytest.raises(ValueError): + AgentConfiguration._get_interval_from("one", self.response, self.default_value, + in_seconds=False, in_milliseconds=False) + + def test_get_interval_from_raises_error_if_both_are_the_defaults(self): + with pytest.raises(ValueError): + AgentConfiguration._get_interval_from("one", self.response, self.default_value)
diff --git a/test/unit/reporter/test_agent_configuration_merger.py new file mode 100644 index 0000000..2ebcfbe --- /dev/null +++ b/test/unit/reporter/test_agent_configuration_merger.py @@ -0,0 +1,90 @@ +from test.pytestutils import before + +from datetime import timedelta + +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration, AgentConfigurationMerger + + +class TestAgentConfigurationMerger: + + @before + def before(self): + self.config = AgentConfiguration( + should_profile=True, + sampling_interval=timedelta(milliseconds=1), + minimum_time_reporting=timedelta(seconds=1), + reporting_interval=timedelta(minutes=1), + max_stack_depth=998) + self.override_config = AgentConfiguration(sampling_interval=timedelta(seconds=9)) + self.configure_agent_response = { + "agentParameters": { + "SamplingIntervalInMilliseconds": "2000", + "MinimumTimeForReportingInMilliseconds": "21000", + "MaxStackDepth": "1001" + }, + "periodInSeconds": 123, + "shouldProfile": False + } + + def test_default_values_are_overridden_at_merge_with(self): + agent_config_merger = AgentConfigurationMerger(default=self.config) + self.assert_init_values() + + agent_config_merger.merge_with(configure_agent_response=self.configure_agent_response) + assert AgentConfiguration.get().should_profile is False + assert AgentConfiguration.get().sampling_interval == timedelta(milliseconds=2000) + assert AgentConfiguration.get().minimum_time_reporting == timedelta(milliseconds=21000) + assert AgentConfiguration.get().reporting_interval == timedelta(seconds=123) + assert AgentConfiguration.get().max_stack_depth == 1001 + + def test_user_overrides_are_not_overridden_at_merge_with(self): + agent_config_merger = AgentConfigurationMerger(default=AgentConfiguration(), user_overrides=self.config) + self.assert_init_values() + + agent_config_merger.merge_with(configure_agent_response=self.configure_agent_response) + self.assert_init_values() + + def test_a_user_override_is_not_overridden_at_merge(self): + agent_config_merger = AgentConfigurationMerger(default=self.config, + user_overrides=self.override_config) + assert AgentConfiguration.get().should_profile is True + assert AgentConfiguration.get().sampling_interval == timedelta(seconds=9) + assert AgentConfiguration.get().minimum_time_reporting == timedelta(seconds=1) + assert AgentConfiguration.get().reporting_interval == timedelta(minutes=1) + assert AgentConfiguration.get().max_stack_depth == 998 + + agent_config_merger.merge_with(configure_agent_response=self.configure_agent_response) + assert AgentConfiguration.get().should_profile is False + assert AgentConfiguration.get().sampling_interval == timedelta(seconds=9) + assert AgentConfiguration.get().minimum_time_reporting == timedelta(milliseconds=21000) + assert AgentConfiguration.get().reporting_interval == timedelta(seconds=123) + assert AgentConfiguration.get().max_stack_depth == 1001 + + def assert_init_values(self): + assert AgentConfiguration.get().should_profile is True + assert AgentConfiguration.get().sampling_interval == timedelta(milliseconds=1) + assert AgentConfiguration.get().minimum_time_reporting == timedelta(seconds=1) + assert AgentConfiguration.get().reporting_interval == timedelta(minutes=1) + assert AgentConfiguration.get().max_stack_depth == 998 + + class TestDisableProfiling: + @before + def before(self): + self.config = AgentConfiguration( + should_profile=True, + sampling_interval=timedelta(milliseconds=1), + minimum_time_reporting=timedelta(seconds=1), + reporting_interval=timedelta(minutes=1), + max_stack_depth=998) + self.agent_config_merger = AgentConfigurationMerger(default=self.config) + + def test_it_sets_should_profile_to_false(self): + self.agent_config_merger.disable_profiling() + assert AgentConfiguration.get().should_profile is False + + def test_it_leaves_other_values_untouched(self): + self.agent_config_merger.disable_profiling() + assert AgentConfiguration.get().sampling_interval == timedelta(milliseconds=1) + assert AgentConfiguration.get().minimum_time_reporting == timedelta(seconds=1) + assert AgentConfiguration.get().reporting_interval == timedelta(minutes=1) + assert AgentConfiguration.get().max_stack_depth == 998 \ No newline at end of file
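These merger tests establish a three-layer precedence: user overrides beat the backend's configure_agent response, which beats the agent defaults. A dict-based sketch of that precedence (illustrative only; the real AgentConfigurationMerger operates on AgentConfiguration objects, and the key names below are invented):

```python
# Hypothetical illustration of the merge order verified above:
# user overrides > backend response > agent defaults.
def merge_configuration(defaults, backend_response, user_overrides):
    merged = dict(defaults)          # start from agent defaults
    merged.update(backend_response)  # backend values replace defaults
    merged.update(user_overrides)    # user overrides always win
    return merged

merged = merge_configuration(
    defaults={"sampling_interval_ms": 1, "max_stack_depth": 998},
    backend_response={"sampling_interval_ms": 2000, "max_stack_depth": 1001},
    user_overrides={"sampling_interval_ms": 9000},
)
assert merged == {"sampling_interval_ms": 9000, "max_stack_depth": 1001}
```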
diff --git a/test/unit/sdk_reporter/test_sdk_profile_encoder.py new file mode 100644 index 0000000..6c18045 --- /dev/null +++ b/test/unit/sdk_reporter/test_sdk_profile_encoder.py @@ -0,0 +1,367 @@ +# -*- coding: utf-8 -*- +import platform + +import pytest +from mock import MagicMock + +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata +from codeguru_profiler_agent.agent_metadata.aws_ec2_instance import AWSEC2Instance +from codeguru_profiler_agent.model.frame import Frame +from test.pytestutils import before +import json +import io +import gzip +from datetime import timedelta + +from codeguru_profiler_agent.metrics.timer import Timer +from codeguru_profiler_agent.model.profile import Profile +from codeguru_profiler_agent.model.sample import Sample +from codeguru_profiler_agent.sdk_reporter.profile_encoder import ProfileEncoder + + +def example_profile(): + start_time = 1514764800000 + end_time = 1514772000000 + profile = Profile(profiling_group_name="TestProfilingGroupName", sampling_interval_seconds=1.0, host_weight=2, + start=start_time, clock=lambda: 1514772000000) + profile.add( + Sample(stacks=[[Frame("bottom"), Frame("middle"), Frame("top")], + [Frame("bottom"), Frame("middle"), Frame("different_top")], + [Frame("bottom"), Frame("middle")]], attempted_sample_threads_count=10, seen_threads_count=15)) + profile.end = end_time + profile.set_overhead_ms(timedelta(milliseconds=256)) + if platform.system() == "Windows": + # On Windows, as time.process_time() stays constant if no cpu time was used (https://bugs.python.org/issue37859), we + # would need to manually override the cpu_time_seconds to ensure the test runs as expected + profile.cpu_time_seconds = 0.123 + return profile + + +agent_metadata = AgentMetadata( + fleet_info=AWSEC2Instance(host_name="testHostName", host_type="testHostType") +) + +environment = { + "timer": Timer(), + "agent_metadata": agent_metadata +} + + +class TestSdkProfileEncoder: + def before(self): + self.profile = example_profile() + self.output_stream = io.BytesIO() + self.subject = \ + ProfileEncoder(gzip=False, environment=environment) + self.decoded_json_result = self.decoded_json_result.__get__(self) + self._decoded_json_result = None + + def decoded_json_result(self): + if not self._decoded_json_result: + self.subject.encode( + profile=self.profile, output_stream=self.output_stream) + self._decoded_json_result = json.loads( + self.output_stream.getvalue().decode("utf-8")) + return self._decoded_json_result + + +class TestEncode(TestSdkProfileEncoder): + @before + def before(self): + super().before() + + def test_it_encodes_the_result_as_a_json_file(self): + assert (type(self.decoded_json_result()) is dict) + + +class TestInsideTheResult(TestSdkProfileEncoder): + @before + def before(self): + super().before() + + def 
test_it_includes_the_start_time_from_the_profile_in_epoch_millis( + self): + assert (self.decoded_json_result()["start"] == 1514764800000) + + def test_it_includes_the_end_time_from_the_profile_in_epoch_millis( + self): + assert (self.decoded_json_result()["end"] == 1514772000000) + + def test_it_includes_the_agent_info_in_the_agent_metadata(self): + assert (self.decoded_json_result()["agentMetadata"]["agentInfo"]["type"] == + agent_metadata.agent_info.agent_type) + assert (self.decoded_json_result()["agentMetadata"]["agentInfo"]["version"] == + agent_metadata.agent_info.version) + + def test_it_includes_the_runtime_version_in_the_agent_metadata(self): + assert (self.decoded_json_result()["agentMetadata"]["runtimeVersion"] == + agent_metadata.runtime_version) + + def test_it_includes_the_fleet_info_in_the_agent_metadata(self): + assert (self.decoded_json_result()["agentMetadata"]["fleetInfo"] == + agent_metadata.fleet_info.serialize_to_map()) + + def test_it_includes_the_sample_weight_in_the_agent_metadata(self): + # Given the example profile, sample_weight = 1 / (1514772000000 - 1514764800000) = ~0.00013888 + assert (0.000138 < self.decoded_json_result()["agentMetadata"]["sampleWeights"]["WALL_TIME"] < 0.000139) + + def test_it_includes_profile_duration_in_the_agent_metadata(self): + assert (self.decoded_json_result()["agentMetadata"]["durationInMs"] == 7200000) + + def test_it_includes_the_cpu_time_in_the_agent_metadata(self): + assert (self.decoded_json_result()["agentMetadata"]["cpuTimeInSeconds"] > 0) + + def test_it_includes_the_num_threads_in_the_agent_metadata(self): + assert (self.decoded_json_result()["agentMetadata"]["metrics"]["numThreads"] == 15) + + def test_it_includes_the_overhead_ms_in_the_agent_metadata(self): + assert (self.decoded_json_result()["agentMetadata"]["agentOverhead"]["timeInMs"] == 256) + + def test_it_includes_the_memory_overhead_in_the_agent_metadata(self): + assert (self.decoded_json_result()["agentMetadata"]["agentOverhead"]["memory_usage_mb"] > 0) + + def test_it_handles_unicode_frames_correctly(self): + self.profile.add( + Sample(stacks=[[Frame("unicode_bottom"), Frame(u"😉"), Frame(u"🙃")]])) + + assert (self.decoded_json_result()["callgraph"]["children"][ + "unicode_bottom"] == { + "children": { + u"😉": { + "children": { + u"🙃": { + "counts": { + "WALL_TIME": 1 + } + } + } + } + } + }) + + @pytest.mark.skipif(platform.system() != "Windows", reason="This test should only be run on Windows") + def test_it_handles_unicode_escape_correctly_on_Windows(self): + self.profile.add( + Sample(stacks=[[Frame("bottom_with_path"), + Frame("middle", file_path="C:/User/ironman/path/xs.py", class_name="ClassA"), + Frame("top", file_path="C:\\User\\ironman\\path\\xs.py", class_name="ClassA")]]) + ) + + assert (self.decoded_json_result()["callgraph"]["children"]["bottom_with_path"] == + { + "children": { + "User.ironman.path.xs:ClassA:middle": { + "children": { + "User.ironman.path.xs:ClassA:top": { + "file": "C:\\User\\ironman\\path\\xs.py", + "counts": { + "WALL_TIME": 1 + } + } + }, + "file": "C:\\User\\ironman\\path\\xs.py" + } + } + }) + + @pytest.mark.skipif(platform.system() == "Windows", reason="This test should not be run on Windows") + def test_it_handles_unicode_escape_correctly_on_non_Windows_system(self): + self.profile.add( + Sample(stacks=[[Frame("bottom_with_path"), + Frame("top", file_path="C:\\User\\ironman\\path\\xs.py", class_name="ClassA")]]) + ) + + assert (self.decoded_json_result()["callgraph"]["children"]["bottom_with_path"] == + { + "children": { 
+ "C:\\User\\ironman\\path\\xs:ClassA:top": { + 'file': 'C:\\User\\ironman\\path\\xs.py', + "counts": { + "WALL_TIME": 1 + } + } + } + }) + + @pytest.mark.skipif(platform.system() == "Windows", reason="This test should not be run on Windows") + def test_it_includes_correct_file_path_when_available_on_non_Windows_system(self): + self.profile.add( + Sample(stacks=[[Frame("bottom_with_path", file_path="path/file1.py"), + Frame("middle_with_path", file_path="path/file2.py"), + Frame("top_without_path")]])) + + assert (self.decoded_json_result()["callgraph"]["children"]["path.file1:bottom_with_path"] == + { + "children": { + "path.file2:middle_with_path": { + "children": { + "top_without_path": { + "counts": { + "WALL_TIME": 1 + } + } + }, + 'file': 'path/file2.py' + } + }, + 'file': 'path/file1.py' + }) + + @pytest.mark.skipif(platform.system() != "Windows", reason="This test should not be run on Windows") + def test_it_includes_correct_file_path_when_available_on_Windows(self): + self.profile.add( + Sample(stacks=[[Frame("bottom_with_path", file_path="path/file1.py"), + Frame("middle_with_path", file_path="path/file2.py"), + Frame("top_without_path")]])) + + assert (self.decoded_json_result()["callgraph"]["children"]["path.file1:bottom_with_path"] == + { + "children": { + "path.file2:middle_with_path": { + "children": { + "top_without_path": { + "counts": { + "WALL_TIME": 1 + } + } + }, + 'file': 'path\\file2.py' + } + }, + 'file': 'path\\file1.py' + }) + + def test_it_includes_class_name_when_available(self): + self.profile.add( + Sample(stacks=[[Frame("bottom_with_path", class_name="ClassA"), + Frame("middle_with_path", class_name="ClassB"), + Frame("top_without_path")]])) + + assert (self.decoded_json_result()["callgraph"]["children"]["ClassA:bottom_with_path"] == + { + "children": { + "ClassB:middle_with_path": { + "children": { + "top_without_path": { + "counts": { + "WALL_TIME": 1 + } + } + } + } + } + }) + + def test_it_includes_line_when_available(self): + self.profile.add( + Sample(stacks=[[Frame("bottom_with_line_no", line_no=123), + Frame("middle_with_line_no", line_no=234), + Frame("top_without_line_no")], + [Frame("bottom_with_line_no", line_no=123), + Frame("middle_with_line_no", line_no=345), + Frame("top_without_line_no")] + ])) + + assert (self.decoded_json_result()["callgraph"]["children"]["bottom_with_line_no"] == + { + "children": { + "middle_with_line_no": { + "children": { + "top_without_line_no": { + "counts": { + "WALL_TIME": 2 + } + } + }, + "line": [234, 345] + } + }, + "line": [123] + }) + + def test_it_includes_the_call_graph_in_self_time_mode(self): + assert (self.decoded_json_result()["callgraph"] == { + "children": { + "bottom": { + "children": { + "middle": { + "counts": { + "WALL_TIME": 1 + }, + "children": { + "top": { + "counts": { + "WALL_TIME": 1 + } + }, + "different_top": { + "counts": { + "WALL_TIME": 1 + } + } + } + } + } + } + } + }) + + +class TestWhenGzippingIsEnabled(TestSdkProfileEncoder): + @before + def before(self): + super().before() + + def test_it_gzips_the_result_before_writing_to_the_stream(self): + ProfileEncoder(gzip=True, environment=environment).encode( + profile=self.profile, output_stream=self.output_stream) + + self.output_stream.seek(0) + uncompressed_result = gzip.GzipFile( + fileobj=self.output_stream, mode="rb").read() + + assert (len(uncompressed_result) > 0) + + +class TestModulePathExtractor: + @before + def before(self): + self.subject = ProfileEncoder(gzip=False, environment=environment).ModulePathExtractor( + 
sys_path=["/tmp/TestPythonAgent/site-package/", "\\tmp\\TestPythonAgent\\site-package\\"]) + + def test_it_removes_root_path(self): + assert self.subject \ + .get_module_path("/tmp/TestPythonAgent/site-package/DummyPackage/dummy") == \ + "DummyPackage.dummy" + + def test_it_returns_same_path_if_no_match_from_sys_paths(self): + assert self.subject \ + .get_module_path("this/is/clearly/not/in/sys/path/dummy.py") == \ + "this.is.clearly.not.in.sys.path.dummy" + + def test_it_removes_longest_root_path_matched_from_sys_path(self): + subject = ProfileEncoder(gzip=False, environment=environment).ModulePathExtractor( + sys_path=["/tmp/TestPythonAgent/site-package/", "/tmp/TestPythonAgent/site-package/threading/", + "\\tmp\\TestPythonAgent\\site-package\\", "\\tmp\\TestPythonAgent\\site-package\\threading\\"]) + + assert subject.get_module_path("/tmp/TestPythonAgent/site-package/threading/DummyPackage/dummy") == \ + "DummyPackage.dummy" + + def test_it_caches_result(self): + self.dummy_module_extract = MagicMock("dummy_module_extractor") + self.subject = ProfileEncoder(gzip=False, environment=environment).ModulePathExtractor( + sys_path=["/tmp/TestPythonAgent/site-package/"], extractor_fun=self.dummy_module_extract) + + some_path = "/tmp/TestPythonAgent/site-package/DummyPackage/dummy.py" + self.subject.get_module_path(some_path) + self.subject.get_module_path(some_path) + + self.dummy_module_extract.assert_called_once_with( + "/tmp/TestPythonAgent/site-package/DummyPackage/dummy.py", + ["/tmp/TestPythonAgent/site-package/"] + ) + + def test_debug_pretty_encode_it_returns_a_json_representation_for_a_profile(self): + result = ProfileEncoder.debug_pretty_encode(profile=example_profile(), environment=environment) + + assert (json.loads(result)["start"] == 1514764800000) diff --git a/test/unit/sdk_reporter/test_sdk_reporter.py b/test/unit/sdk_reporter/test_sdk_reporter.py new file mode 100644 index 0000000..0113fcf --- /dev/null +++ b/test/unit/sdk_reporter/test_sdk_reporter.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- +import boto3 + +from datetime import timedelta +from codeguru_profiler_agent.utils.time import current_milli_time +from test.pytestutils import before +from mock import MagicMock +from botocore.stub import Stubber, ANY + +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfigurationMerger +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata, DefaultFleetInfo +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration +from codeguru_profiler_agent.sdk_reporter.sdk_reporter import SdkReporter +from codeguru_profiler_agent.sdk_reporter.profile_encoder import ProfileEncoder +from codeguru_profiler_agent.model.profile import Profile +from codeguru_profiler_agent.codeguru_client_builder import CodeGuruClientBuilder + +profiling_group_name = "test-ProfilingGroup-name" +profile = Profile(profiling_group_name, 1.0, 0.5, current_milli_time()) + + +class TestSdkReporter: + def before(self): + codeguru_client_builder = CodeGuruClientBuilder(environment={ + "aws_session": boto3.session.Session() + }) + + self.client_stubber = Stubber(codeguru_client_builder.codeguru_client) + + profile_encoder = MagicMock(name="profile_encoder", spec=ProfileEncoder) + profile_encoder.encode.side_effect = lambda **args: args["output_stream"].write( + b"test-profile-encoder-output") + self.environment = { + "profiling_group_name": profiling_group_name, + "profile_encoder": profile_encoder, + "codeguru_profiler_builder": 
diff --git a/test/unit/sdk_reporter/test_sdk_reporter.py new file mode 100644 index 0000000..0113fcf --- /dev/null +++ b/test/unit/sdk_reporter/test_sdk_reporter.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- +import boto3 + +from datetime import timedelta +from codeguru_profiler_agent.utils.time import current_milli_time +from test.pytestutils import before +from mock import MagicMock +from botocore.stub import Stubber, ANY + +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfigurationMerger +from codeguru_profiler_agent.agent_metadata.agent_metadata import AgentMetadata, DefaultFleetInfo +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration +from codeguru_profiler_agent.sdk_reporter.sdk_reporter import SdkReporter +from codeguru_profiler_agent.sdk_reporter.profile_encoder import ProfileEncoder +from codeguru_profiler_agent.model.profile import Profile +from codeguru_profiler_agent.codeguru_client_builder import CodeGuruClientBuilder + +profiling_group_name = "test-ProfilingGroup-name" +profile = Profile(profiling_group_name, 1.0, 0.5, current_milli_time()) + + +class TestSdkReporter: + def before(self): + codeguru_client_builder = CodeGuruClientBuilder(environment={ + "aws_session": boto3.session.Session() + }) + + self.client_stubber = Stubber(codeguru_client_builder.codeguru_client) + + profile_encoder = MagicMock(name="profile_encoder", spec=ProfileEncoder) + profile_encoder.encode.side_effect = lambda **args: args["output_stream"].write( + b"test-profile-encoder-output") + self.environment = { + "profiling_group_name": profiling_group_name, + "profile_encoder": profile_encoder, + "codeguru_profiler_builder": codeguru_client_builder, + "agent_metadata": AgentMetadata(fleet_info=DefaultFleetInfo()), + "reporting_interval": timedelta(minutes=13), + "sampling_interval": timedelta(seconds=13), + "minimum_time_reporting": timedelta(minutes=13), + "max_stack_depth": 1300 + } + default_config = AgentConfiguration( + should_profile=True, + sampling_interval=self.environment["sampling_interval"], + reporting_interval=self.environment["reporting_interval"], + minimum_time_reporting=self.environment["minimum_time_reporting"]) + self.environment["agent_config_merger"] = AgentConfigurationMerger(default_config) + self.subject = SdkReporter(environment=self.environment) + self.subject.setup() + + +class TestReport(TestSdkReporter): + @before + def before(self): + super().before() + + def test_report_calls_the_client(self): + expected_params = { + 'agentProfile': ANY, + 'contentType': 'application/json', + 'profilingGroupName': profiling_group_name + } + self.client_stubber.add_response('post_agent_profile', {}, expected_params) + + with self.client_stubber: + assert self.subject.report(profile) is True + + def test_return_false_when_report_throws_error(self): + self.client_stubber.add_client_error('post_agent_profile', http_status_code=500, + service_message='Simulated error in post_agent_profile call') + with self.client_stubber: + assert self.subject.report(profile) is False + + +class TestConfigureAgent(TestSdkReporter): + @before + def before(self): + super().before() + + def test_configure_agent_calls_the_client(self): + response = { + 'configuration': { + 'agentParameters': { + 'SamplingIntervalInMilliseconds': '91000', + 'MinimumTimeForReportingInMilliseconds': '60000', + 'MaxStackDepth': '1001' + }, + 'periodInSeconds': 123, + 'shouldProfile': False + } + } + self.client_stubber.add_response('configure_agent', response) + with self.client_stubber: + self.subject.refresh_configuration() + assert AgentConfiguration.get().should_profile is False + assert AgentConfiguration.get().sampling_interval.total_seconds() == 91 + + def test_agent_configuration_when_configure_agent_throws_error(self): + self.client_stubber.add_client_error('configure_agent', http_status_code=500, + service_message='Simulated error in configure_agent call') + with self.client_stubber: + self.subject.refresh_configuration() + assert AgentConfiguration.get().should_profile is True + assert AgentConfiguration.get().sampling_interval == timedelta(seconds=13) + + def test_when_backend_sends_resource_not_found_it_stops_the_profiling(self): + self.client_stubber.add_client_error('configure_agent', service_error_code='ResourceNotFoundException', + service_message='Simulated error in configure_agent call') + with self.client_stubber: + self.subject.refresh_configuration() + assert AgentConfiguration.get().should_profile is False + + def test_when_backend_sends_validation_exception_it_stops_the_profiling(self): + self.client_stubber.add_client_error('configure_agent', service_error_code='ValidationException', + service_message='Simulated error in configure_agent call') + with self.client_stubber: + self.subject.refresh_configuration() + assert AgentConfiguration.get().should_profile is False diff --git a/test/unit/test_codeguru_client_builder.py new file mode 100644 index 0000000..a336850 --- /dev/null +++ b/test/unit/test_codeguru_client_builder.py @@ -0,0 +1,133 @@ +import os + +import boto3 +import pytest +from mock import MagicMock + +from test.pytestutils import before +from 
codeguru_profiler_agent.codeguru_client_builder import CodeGuruClientBuilder + + +class TestCodeGuruClientBuilder: + @before + def before(self): + default_session = boto3.session.Session() + self.subject = CodeGuruClientBuilder(environment={'aws_session': default_session}) + + def test_codeguru_client_is_not_built_at_initialization(self): + assert self.subject._codeguru_client_instance is None + + def test_codeguru_client_getter_builds_the_client(self): + # make sure the getter builds it + assert self.subject.codeguru_client is not None + assert self.subject._codeguru_client_instance is not None + + class TestWhenSessionIsProvided: + @before + def before(self): + self.subject = CodeGuruClientBuilder(environment={ + 'aws_session': boto3.session.Session(region_name='ap-southeast-2') + }) + + def test_session_region_is_used(self): + assert 'ap-southeast-2' in self.subject.codeguru_client._endpoint.host + + class TestWhenRegionProvidedInSessionDoesNotMatchWithRegionSetInEnvironment: + @pytest.fixture(autouse=True) + def around(self): + previous_region_value = os.environ.get('AWS_DEFAULT_REGION') + os.environ['AWS_DEFAULT_REGION'] = 'eu-west-2' + self.subject = CodeGuruClientBuilder(environment={ + 'aws_session': boto3.session.Session(region_name="ap-southeast-2") + }) + yield + if previous_region_value is not None: + os.environ['AWS_DEFAULT_REGION'] = previous_region_value + else: + del os.environ['AWS_DEFAULT_REGION'] + + def test_provided_region_is_used(self): + assert 'ap-southeast-2' in self.subject.codeguru_client._endpoint.host + + class TestWhenRegionIsProvided: + @before + def before(self): + self.subject = CodeGuruClientBuilder(environment={ + 'aws_session': boto3.session.Session(region_name='eu-west-2'), + 'region_name': 'eu-west-2' + }) + + def test_provided_region_is_used(self): + assert 'eu-west-2' in self.subject.codeguru_client._endpoint.host + + class TestWhenRegionProvidedDoesNotMatchWithSessionRegion: + @before + def before(self): + self.subject = CodeGuruClientBuilder(environment={ + 'aws_session': boto3.session.Session(region_name='eu-west-2'), + 'region_name': 'ap-southeast-2' + }) + + def test_provided_region_is_used(self): + assert 'ap-southeast-2' in self.subject.codeguru_client._endpoint.host + + class TestWhenRegionProvidedDoesNotMatchWithRegionSetInEnvironment: + @pytest.fixture(autouse=True) + def around(self): + previous_region_value = os.environ.get('AWS_DEFAULT_REGION') + os.environ['AWS_DEFAULT_REGION'] = 'eu-west-2' + self.subject = CodeGuruClientBuilder(environment={ + 'aws_session': boto3.session.Session(), + 'region_name': 'ap-southeast-2' + }) + yield + if previous_region_value is not None: + os.environ['AWS_DEFAULT_REGION'] = previous_region_value + else: + del os.environ['AWS_DEFAULT_REGION'] + + def test_provided_region_is_used(self): + assert 'ap-southeast-2' in self.subject.codeguru_client._endpoint.host + + class TestWhenRegionIsSetInEnvironment: + @pytest.fixture(autouse=True) + def around(self): + previous_region_value = os.environ.get('AWS_DEFAULT_REGION') + os.environ['AWS_DEFAULT_REGION'] = 'ap-southeast-2' + self.subject = CodeGuruClientBuilder(environment={ + 'aws_session': boto3.session.Session() + }) + yield + if previous_region_value is not None: + os.environ['AWS_DEFAULT_REGION'] = previous_region_value + else: + del os.environ['AWS_DEFAULT_REGION'] + + def test_correct_region_is_used(self): + assert 'ap-southeast-2' in self.subject.codeguru_client._endpoint.host + + class TestWhenEndpointIsProvided: + @before + def before(self): + 
self.subject = CodeGuruClientBuilder(environment={ + 'aws_session': boto3.session.Session(), + 'endpoint_url': 'https://some-endpoint.amazonaws.com' + }) + + def test_provided_endpoint_is_used(self): + assert self.subject.codeguru_client._endpoint.host == 'https://some-endpoint.amazonaws.com' + + class TestWhenAwsSessionIsProvided: + @before + def before(self): + self.mock_session = MagicMock(name="profiler", spec=boto3.session.Session) + self.subject = CodeGuruClientBuilder( + environment={ + 'aws_session': self.mock_session + } + ) + # it is necessary to call codeguru_client; otherwise no client will be created. + self.subject.codeguru_client + + def test_it_creates_client_with_given_session(self): + self.mock_session.client.assert_called_once()
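Taken together, the client-builder tests above pin down a region-resolution order: an explicit `region_name` wins over the region on the provided session, which wins over `AWS_DEFAULT_REGION`, and an explicit `endpoint_url` trumps all of them. A sketch of how such a builder could wire this through boto3 (illustrative; `build_client` is not the agent's API):

```python
# Hypothetical region/endpoint wiring matching the precedence tested above.
import boto3

def build_client(environment):
    session = environment["aws_session"]
    return session.client(
        "codeguruprofiler",
        region_name=environment.get("region_name"),    # None falls back to session/env region
        endpoint_url=environment.get("endpoint_url"),  # explicit endpoint wins over region
    )

# Usage: explicit region overrides whatever the session would resolve.
client = build_client({"aws_session": boto3.session.Session(), "region_name": "eu-west-2"})
```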
minimum_time_reporting=INITIAL_MINIMUM_REPORTING_INTERVAL, + reporting_interval=self.reporting_interval, + max_stack_depth=999, + cpu_limit_percentage=10 + ) + ) + + # the Profiler constructor consumes agent-configuration overrides from the environment dict + assert len(self.environment.keys()) == 7 + self.profiler = Profiler(profiling_group_name=TEST_PROFILING_GROUP_NAME, environment_override=self.environment) + assert len(self.environment.keys()) == 6 + + self.subject = LocalAggregator(**self.configuration) + self.mock_profile_factory.reset_mock() + self.timer.reset_mock() + + def move_clock_to(duration_timedelta): + self.time_now = \ + CURRENT_TIME_FOR_TESTING_SECOND + duration_timedelta.total_seconds() + + self.move_clock_to = move_clock_to + + +class TestAdd(TestLocalAggregator): + @before + def before(self): + super().before() + self.mock_profile.get_memory_usage_bytes = MagicMock( + return_value=DEFAULT_MEMORY_LIMIT_BYTES - 1) + self.sample = Sample([["method1", "method2"]]) + self.move_clock_to(self.reporting_interval - ONE_SECOND) + + def test_adding_sample_to_profile_successfully(self): + self.subject.add(self.sample) + + self.mock_profile.add.assert_called_once_with(self.sample) + + def test_exception_raised_is_propagated(self): + self.mock_profile.add.side_effect = ValueError("Foo") + with pytest.raises(ValueError): + self.subject.add(self.sample) + + +class TestFlushWhenReportingIntervalReached(TestLocalAggregator): + @before + def before(self): + super().before() + self.mock_profile.get_memory_usage_bytes = MagicMock( + return_value=DEFAULT_MEMORY_LIMIT_BYTES - 1) + self.move_clock_to(self.reporting_interval + ONE_SECOND) + self.mock_profile.is_empty = MagicMock( + return_value=False) + + def test_it_reports(self): + self.mock_reporter.report.return_value = True + self.subject.flush() + self.mock_reporter.report.assert_called_once_with(self.subject.profile) + + def test_it_returns_true(self): + self.mock_reporter.report.return_value = True + assert self.subject.flush() + + def test_profile_gets_reset(self): + self.mock_reporter.report.return_value = True + + self.subject.flush() + + assert_profile_is_reset(self.mock_profile_factory, self.clock) + + def test_timer_gets_reset(self): + self.mock_reporter.report.return_value = True + + self.subject.flush() + + self.timer.reset.assert_called_once() + + def test_it_adds_the_run_profiler_overhead_metric_to_the_profile( + self): + run_profiler_metric = Metric() + run_profiler_metric.add(0.1) + self.timer.get_metric.side_effect = \ + lambda metric_name: metric_name == "runProfiler" and run_profiler_metric + + self.subject.flush() + + self.mock_profile.set_overhead_ms.assert_called_once_with(duration_timedelta=timedelta(seconds=0.1)) + + +class TestFlushWhenProfileIsEmpty(TestLocalAggregator): + @before + def before(self): + super().before() + self.mock_profile.get_memory_usage_bytes = MagicMock( + return_value=DEFAULT_MEMORY_LIMIT_BYTES - 1) + self.move_clock_to(self.reporting_interval + ONE_SECOND) + self.mock_profile.is_empty = MagicMock(return_value=True) + + def test_it_does_not_report(self): + self.subject.flush() + + self.mock_reporter.report.assert_not_called() + + def test_it_resets_the_profile(self): + self.subject.flush() + + assert_profile_is_reset(self.mock_profile_factory, self.clock) + + def test_it_resets_the_timer(self): + self.subject.flush() + + self.timer.reset.assert_called_once() + + +class TestFlushWhenReportFailed(TestLocalAggregator): + @before + def before(self): + super().before() + self.mock_profile.get_memory_usage_bytes = MagicMock( + return_value=DEFAULT_MEMORY_LIMIT_BYTES - 1) + self.move_clock_to(self.reporting_interval + ONE_SECOND) + self.mock_profile.is_empty = MagicMock( + return_value=False) + self.mock_reporter.report.return_value = False + + def test_it_resets_the_profile(self): + self.subject.flush() + + assert_profile_is_reset(self.mock_profile_factory, self.clock) + + def test_it_resets_the_timer(self): + self.subject.flush() + + self.timer.reset.assert_called_once() + + +class TestFlushWhenRetryingReport(TestLocalAggregator): + @before + def before(self): + super().before() + self.mock_profile.get_memory_usage_bytes = MagicMock( + return_value=DEFAULT_MEMORY_LIMIT_BYTES - 1) + self.move_clock_to(self.reporting_interval + ONE_SECOND) + self.mock_profile.is_empty = MagicMock( + return_value=False) + self.mock_reporter.report.return_value = False + + def test_it_updates_the_profiler_overhead_metric_on_the_profile( + self): + run_profiler_metric = Metric() + run_profiler_metric.add(0.1) + self.timer.get_metric.side_effect = \ + lambda metric_name: metric_name == "runProfiler" and run_profiler_metric + + self.subject._is_over_reporting_interval = MagicMock(return_value=False) + # first failed report + self.subject.flush() + + # some more time is added to the metric + run_profiler_metric.add(0.3) + + # try to report again + self.time_now += timedelta(minutes=10).total_seconds() + self.subject.flush() + self.mock_profile.set_overhead_ms.assert_has_calls([ + call(duration_timedelta=timedelta(seconds=0.1)), + call(duration_timedelta=timedelta(seconds=0.4)) + ]) + + +class TestFlushWhenReportingIntervalNotReached(TestLocalAggregator): + @before + def before(self): + super().before() + self.mock_profile.get_memory_usage_bytes = MagicMock( + return_value=DEFAULT_MEMORY_LIMIT_BYTES - 1) + self.move_clock_to(self.reporting_interval - ONE_SECOND) + self.mock_profile.is_empty = MagicMock( + return_value=False) + + def test_it_does_not_report(self): + self.mock_reporter.report.return_value = True + self.subject.flush() + self.mock_reporter.report.assert_not_called() + + def test_it_returns_false(self): + self.mock_reporter.report.return_value = True + assert not self.subject.flush() + + +class TestForceFlush(TestLocalAggregator): + + def before(self): + super().before() + self.mock_profile.get_memory_usage_bytes.return_value = DEFAULT_MEMORY_LIMIT_BYTES - 1 + self.mock_profile.is_empty = MagicMock(return_value=False) + + +class TestWhenMinReportingTimeNotReached(TestForceFlush): + @before + def before(self): + super().before() + self.move_clock_to(INITIAL_MINIMUM_REPORTING_INTERVAL - ONE_SECOND) + self.mock_profile_factory.reset_mock() + + def test_it_does_not_report(self): + self.subject.flush(force=True) + + self.mock_reporter.report.assert_not_called() + + def test_profile_gets_reset(self): + self.subject.flush(force=True) + + assert_profile_is_reset(self.mock_profile_factory, self.clock) + + def test_timer_gets_reset(self): + self.subject.flush(force=True) + + self.timer.reset.assert_called_once() + + +class TestWhenMinReportingTimeReached(TestForceFlush): + @before + def before(self): + super().before() + self.move_clock_to(INITIAL_MINIMUM_REPORTING_INTERVAL + ONE_SECOND) + + def test_it_reports(self): + self.mock_reporter.report = MagicMock(return_value=True) + + self.subject.flush(force=True) + + self.mock_reporter.report.assert_called_once() + + def test_profile_gets_reset(self): + self.mock_reporter.report = MagicMock(return_value=True) + + self.subject.flush(force=True) + + assert_profile_is_reset(self.mock_profile_factory, self.clock)
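 + # note: flush(force=True) still honors the minimum reporting interval; the clock was moved one second past INITIAL_MINIMUM_REPORTING_INTERVAL in before(), so reporting is allowed here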
+ + def test_timer_gets_reset(self): + self.subject.flush(force=True) + + self.timer.reset.assert_called_once() + + def test_it_adds_the_run_profiler_overhead_metric_to_the_profile( + self): + run_profiler_metric = Metric() + run_profiler_metric.add(0.1) + self.timer.get_metric.side_effect = \ + lambda metric_name: metric_name == "runProfiler" and run_profiler_metric + + self.subject.flush(force=True) + + self.mock_profile.set_overhead_ms.assert_called_once_with(duration_timedelta=timedelta(seconds=0.1)) + + def test_when_report_failed_profile_gets_reset(self): + self.mock_reporter.report = MagicMock(return_value=False) + + self.subject.flush(force=True) + + assert_profile_is_reset(self.mock_profile_factory, self.clock) + + +class TestMemoryUsageLimitExceeded(TestLocalAggregator): + def before(self): + super().before() + self.mock_profile.get_memory_usage_bytes = MagicMock( + return_value=DEFAULT_MEMORY_LIMIT_BYTES + 1) + self.sample = Sample([["method1", "method2"]]) + + +class TestLastFlushWasLongerThanMinTimeForReporting(TestMemoryUsageLimitExceeded): + @before + def before(self): + super().before() + self.move_clock_to(INITIAL_MINIMUM_REPORTING_INTERVAL + ONE_SECOND) + self.mock_profile.is_empty = MagicMock(return_value=False) + + def test_it_reports_without_exception(self): + self.mock_reporter.report.return_value = True + self.subject.add(self.sample) + self.mock_reporter.report.assert_called_once_with(self.subject.profile) + + def test_it_updates_last_report_attempted(self): + self.mock_reporter.report.return_value = True + assert (self.subject.last_report_attempted == CURRENT_TIME_FOR_TESTING_MILLI) + self.subject.add(self.sample) + assert (self.subject.last_report_attempted == self.time_now * 1000) + + +class TestLastFlushWasWithinMinTimeForReporting(TestLocalAggregator): + @before + def before(self): + super().before() + self.mock_profile.get_memory_usage_bytes = MagicMock( + return_value=DEFAULT_MEMORY_LIMIT_BYTES + 1) + self.sample = Sample([["method1", "method2"]]) + self.move_clock_to(INITIAL_MINIMUM_REPORTING_INTERVAL - ONE_SECOND) + + def test_exception_raised_when_memory_usage_exceeded(self): + with pytest.raises(OverMemoryLimitException): + self.subject.add(self.sample) diff --git a/test/unit/test_main.py b/test/unit/test_main.py new file mode 100644 index 0000000..0f80669 --- /dev/null +++ b/test/unit/test_main.py @@ -0,0 +1,121 @@ +import pytest +import os +from codeguru_profiler_agent.__main__ import main +from codeguru_profiler_agent.profiler_builder import PG_NAME_ENV, ENABLED_ENV + + +class TestMain: + @pytest.fixture(autouse=True) + def around(self): + # create a script file containing python code + # this script will create a flag file so we can check if it was called correctly + # existence of the flag file proves the script was called + # the script also throws an exception if not passed the expected arguments + self.script_file_name = 'script_file_for_test_main.py' + self.flag_file_name = 'flag_file_for_main_test' + self._write_script_file() + self._clean_flag_file() + yield + self._clean_flag_file() + self._clean_script_file() + + def test_it_starts_given_script_with_script_arguments(self): + args = [self.script_file_name, '--where', 'zoo', 'foo'] + main(input_args=args) + assert self._script_was_called() + + def test_it_starts_given_module_with_script_arguments(self): + args = ['-m', self.script_file_name[:-3], '--where', 'zoo', 'foo'] + main(input_args=args) + assert self._script_was_called() + + def 
test_it_passes_script_arguments_and_checks_the_values_of_all_arguments(self): + args = [self.script_file_name, 'one', 'two', 'three'] + with pytest.raises(ValueError) as ve: + main(input_args=args) + assert str(ve.value).endswith("'one', 'two', 'three']") + + def test_it_passes_script_arguments_and_checks_the_values_of_all_arguments_when_no_client_arguments_are_set(self): + args = [self.script_file_name] + with pytest.raises(ValueError) as ve: + main(input_args=args) + # Depending on how the unit tests are run, the argument can be: + # - 'setup.py' if run through an internal build tool, or + # - '/../.../pytest' if run with pytest. + assert str(ve.value).endswith("['setup.py']") or str(ve.value).endswith("pytest']") + + def test_it_exits_if_no_arguments(self): + with pytest.raises(SystemExit) as se: + main(input_args=[]) + assert se.value.code == 2 + + def test_it_can_take_profiler_options_as_argument(self): + args = ['-p', 'pg_name', '-r', 'eu-north-1', '-c', 'test-cred-profile', '--log', 'info', + self.script_file_name, '--where', 'zoo', 'foo'] + main(input_args=args, start_profiler=lambda *rargs: True) + assert self._script_was_called() + + def test_it_creates_profiler_with_provided_argument(self): + args = ['-p', 'pg_name', '-r', 'eu-north-1', '-c', 'test-cred-profile', '--log', 'info', + self.script_file_name, '--where', 'zoo', 'foo'] + ran_start_profiler = False + + def mock_start_profiler(options, env): + nonlocal ran_start_profiler + ran_start_profiler = True + assert options.profiling_group_name == 'pg_name' + assert options.region == 'eu-north-1' + assert options.credential_profile == 'test-cred-profile' + assert env == os.environ + + main(args, start_profiler=mock_start_profiler) + assert ran_start_profiler + + def test_it_can_take_profiler_options_from_env(self): + args = [self.script_file_name, '--where', 'zoo', 'foo'] + env = {PG_NAME_ENV: 'pg_name'} + main(input_args=args, env=env) + assert self._script_was_called() + + def test_it_can_run_a_module(self): + args = ['-m', self.script_file_name[:-3], '--where', 'zoo', 'foo'] + main(input_args=args) + assert self._script_was_called() + + def test_it_does_not_create_profiler_if_enabled_is_false(self): + args = ['-p', 'pg_name', self.script_file_name, '--where', 'zoo', 'foo'] + env = {ENABLED_ENV: 'false'} + main(input_args=args, env=env) + from codeguru_profiler_agent.__main__ import profiler + assert profiler is None + + def test_it_creates_profiler_if_enabled_is_true(self): + args = ['-p', 'pg_name', self.script_file_name, '--where', 'zoo', 'foo'] + env = {ENABLED_ENV: 'true'} + main(input_args=args, env=env) + from codeguru_profiler_agent import Profiler + from codeguru_profiler_agent.__main__ import profiler + assert profiler is not None + assert isinstance(profiler, Profiler) + + def _script_was_called(self): + return os.path.exists(self.flag_file_name) + + def _clean_flag_file(self): + if os.path.exists(self.flag_file_name): + os.remove(self.flag_file_name) + + def _write_script_file(self): + script_file = open(self.script_file_name, 'w') + script_file.write("""import sys +if len(sys.argv) != 4: + raise ValueError('Wrong number of arguments, expected 4 arguments, found ' + str(sys.argv)) +if sys.argv[1:4] != ['--where', 'zoo', 'foo']: + raise ValueError('Wrong values of arguments, found ' + str(sys.argv)) +open('{flag_file_name}','w') +""".format(flag_file_name=self.flag_file_name)) + script_file.close() + + def _clean_script_file(self): + if os.path.exists(self.script_file_name): + os.remove(self.script_file_name) diff --git a/test/unit/test_profiler.py b/test/unit/test_profiler.py new file mode 100644 index 0000000..2417362 --- /dev/null +++ b/test/unit/test_profiler.py @@ -0,0 +1,229 @@ +import pytest +from datetime import timedelta +from mock import Mock +from codeguru_profiler_agent.profiler import Profiler +from codeguru_profiler_agent.profiler_runner import ProfilerRunner + + +def throw_exception(*args, **kwargs): + raise Exception("Exception from TestProfiler") + + +def mock_profiler_runner_factory(profiler_runner): + return lambda environment: profiler_runner + + +class TestProfiler: + class TestInit: + class TestExceptionHandling: + def test_exceptions_are_caught_and_do_not_propagate(self): + Profiler( + profiling_group_name="test-application", + environment_override={ + "profiler_runner_factory": throw_exception + }, + ) + Profiler(profiling_group_name=None) + + class TestReportingModeValidation: + class TestWhenReportingModeIsInvalid: + def test_it_raises_a_value_error(self): + with pytest.raises(ValueError): + Profiler( + profiling_group_name="unit-test", + environment_override={ + "allow_top_level_exceptions": True, + "reporting_mode": "wrong-reporting-mode" + }, + ) + + class TestWhenCustomReportingIntervalIsLessThan30Seconds: + def test_it_does_not_propagate_a_value_error(self): + # without "allow_top_level_exceptions" the validation error is swallowed by the Profiler + environment = {"reporting_interval": timedelta(seconds=29)} + Profiler(profiling_group_name="test-application", environment_override=environment) + + class TestRemovesEnvironmentOverridesForAgentConfiguration: + def test_checks_number_of_params(self): + environment = { + 'reporting_mode': 'codeguru_service', + 'excluded_threads': set(), + 'should_profile': True, + 'sampling_interval': timedelta(seconds=1), + 'reporting_interval': timedelta(minutes=10), + 'minimum_time_reporting': timedelta(minutes=1), + 'max_stack_depth': 1000, + 'cpu_limit_percentage': 10, + 'memory_limit_bytes': 1024, + 'host_weight': 1.0, + 'max_threads': 100 + } + assert len(environment.keys()) == 11 + Profiler(profiling_group_name="test-application", environment_override=environment) + assert len(environment.keys()) == 5 + + class TestStart: + class TestExceptionHandling: + @pytest.fixture(autouse=True) + def around(self): + self.profiler_runner = Mock(spec_set=ProfilerRunner) + self.profiler_runner.is_running = Mock(return_value=False) + self.profiler_runner.start = throw_exception + self.profiler = Profiler( + profiling_group_name="test-application", + environment_override={ + "profiler_runner_factory": mock_profiler_runner_factory(self.profiler_runner) + }) + yield + self.profiler.stop() + + def test_exceptions_are_caught_and_do_not_propagate(self): + assert (not self.profiler.start()) + + class TestWhenAnotherInstanceAlreadyStarted: + @pytest.fixture(autouse=True) + def around(self): + self.profiler_runner = Mock(spec_set=ProfilerRunner) + self.profiler_runner.start = Mock(return_value=True) + self.profiler_runner.is_running = Mock(return_value=False) + self.first_profiler = Profiler( + profiling_group_name="pg_name", + environment_override={ + "profiler_runner_factory": mock_profiler_runner_factory(self.profiler_runner) + }) + self.first_profiler.start() + self.second_profiler = Profiler( + profiling_group_name="test-application", + environment_override={ + "profiler_runner_factory": + mock_profiler_runner_factory(self.profiler_runner) + }) + yield + self.first_profiler.stop() + self.second_profiler.stop() + + def test_it_fails_to_start_a_second_profiler(self): + assert (not
self.second_profiler.start()) + + class TestWhenRunnerIsRunning: + @pytest.fixture(autouse=True) + def around(self): + self.profiler_runner = Mock(spec_set=ProfilerRunner) + self.profiler_runner.is_running = Mock(return_value=True) + self.profiler_runner.resume = Mock() + self.profiler = Profiler( + profiling_group_name="test-application", + environment_override={ + "profiler_runner_factory": + mock_profiler_runner_factory(self.profiler_runner) + }) + yield + self.profiler.stop() + + def test_it_returns_true(self): + assert self.profiler.start() + + def test_it_calls_resume(self): + self.profiler.start() + assert self.profiler_runner.resume.called + + class TestStop: + class TestWhenStartWasNotCalled: + @pytest.fixture(autouse=True) + def around(self): + self.profiler = Profiler(profiling_group_name="test-application") + yield + + def test_it_returns_true(self): + assert self.profiler.stop() + + class TestExceptionHandling: + @pytest.fixture(autouse=True) + def around(self): + self.profiler_runner = Mock(spec_set=ProfilerRunner) + self.profiler_runner.stop = throw_exception + self.profiler_runner.is_running = Mock(return_value=False) + self.profiler = Profiler( + profiling_group_name="test-application", + environment_override={ + "profiler_runner_factory": mock_profiler_runner_factory(self.profiler_runner) + }) + yield + self.profiler_runner.stop = Mock(return_value=True) + self.profiler.stop() + + def test_exceptions_are_caught_and_do_not_propagate(self): + self.profiler.start() + assert (not self.profiler.stop()) + + class TestWhenAnotherInstanceAlreadyStarted: + @pytest.fixture(autouse=True) + def around(self): + self.profiler_runner = Mock(spec_set=ProfilerRunner) + self.profiler_runner.start = Mock(return_value=True) + self.profiler_runner.is_running = Mock(return_value=False) + self.first_profiler = Profiler( + profiling_group_name="pg_name", + environment_override={ + "profiler_runner_factory": mock_profiler_runner_factory(self.profiler_runner) + }) + self.first_profiler.start() + self.second_profiler = Profiler( + profiling_group_name="test-application", + environment_override={ + "profiler_runner_factory": mock_profiler_runner_factory(self.profiler_runner) + }) + yield + self.first_profiler.stop() + self.second_profiler.stop() + + def test_stopping_first_instance_allows_next_profiler_to_start(self): + self.first_profiler.stop() + assert self.second_profiler.start() + + class TestPause: + @pytest.fixture(autouse=True) + def around(self): + self.profiler_runner = Mock(spec_set=ProfilerRunner) + self.profiler_runner.pause = Mock() + self.profiler = Profiler( + profiling_group_name="test-application", + environment_override={ + "profiler_runner_factory": mock_profiler_runner_factory(self.profiler_runner) + }) + yield + self.profiler.stop() + + def test_it_returns_true(self): + assert self.profiler.pause() + + def test_it_calls_pause_on_runner(self): + self.profiler.pause() + assert self.profiler_runner.pause.called + + def test_exceptions_are_caught_and_do_not_propagate(self): + self.profiler_runner.pause = throw_exception + assert (not self.profiler.pause()) + + class TestStr: + @pytest.fixture(autouse=True) + def around(self): + self.profiler_runner = Mock(spec_set=ProfilerRunner) + self.session = Mock() + self.session.__repr__ = \ + Mock(return_value="Session(region_name='eu-west-2', profile_name='alternate_credentials')") + self.profiler = Profiler( + profiling_group_name="test-application", + region_name="ap-southeast-1", + aws_session=self.session, + environment_override={ + 
"sampling_interval": timedelta(seconds=0.01), + "profiler_runner_factory": mock_profiler_runner_factory(self.profiler_runner) + }) + yield + + def test_it_prints_a_subset_of_parameters(self): + assert str(self.profiler) == \ + "Profiler(environment={'max_threads': 100," \ + " 'profiling_group_name': 'test-application'," \ + " 'region_name': 'ap-southeast-1'," \ + " 'aws_session': Session(region_name='eu-west-2', profile_name='alternate_credentials')}" diff --git a/test/unit/test_profiler_builder.py b/test/unit/test_profiler_builder.py new file mode 100644 index 0000000..0917e12 --- /dev/null +++ b/test/unit/test_profiler_builder.py @@ -0,0 +1,157 @@ +import os +import datetime +from unittest.mock import MagicMock, ANY +import boto3 +from codeguru_profiler_agent import Profiler +from codeguru_profiler_agent.profiler_builder import \ + build_profiler, PG_NAME_ENV, PG_ARN_ENV, ENABLED_ENV, REGION_ENV, CREDENTIAL_PATH + + +class TestProfilerBuilder: + class TestWhenProfilingGroupNameParameterIsProvided: + def test_it_creates_a_profiler_and_uses_correct_name(self): + pg_name = "my_profiling_group" + profiler_factory = MagicMock(spec=Profiler) + subject = build_profiler(pg_name=pg_name, env={}, + profiler_factory=profiler_factory) + + assert subject is not None + profiler_factory.assert_called_once_with(profiling_group_name=pg_name, region_name=ANY, + aws_session=ANY, environment_override=ANY) + + class TestWhenProfilingGroupNameIsInEnvironment: + def test_it_creates_a_profiler_and_uses_correct_name(self): + pg_name = "my_profiling_group" + profiler_factory = MagicMock(spec=Profiler) + env = {PG_NAME_ENV: pg_name} + subject = build_profiler(env=env, profiler_factory=profiler_factory) + + assert subject is not None + profiler_factory.assert_called_once_with(profiling_group_name=pg_name, region_name=ANY, + aws_session=ANY, environment_override=ANY) + + class TestWhenProfilingGroupNameIsMissing: + def test_it_returns_none(self): + assert build_profiler(env={}) is None + + class TestWhenRegionNameParameterIsProvided: + def test_it_creates_a_profiler_and_uses_correct_region(self): + region = "eu-north-1" + profiler_factory = MagicMock(spec=Profiler) + subject = build_profiler(pg_name="my_profiling_group", region_name=region, + env={}, profiler_factory=profiler_factory) + assert subject is not None + profiler_factory.assert_called_once_with(profiling_group_name=ANY, region_name=region, + aws_session=ANY, environment_override=ANY) + + class TestWhenCredentialProfileIsProvided: + def test_it_creates_a_profile_rand_uses_correct_credential_profile(self): + credential_profile = "test-profile" + mock_session = MagicMock(name="profiler", spec=boto3.session.Session) + subject = build_profiler(pg_name="my_profiling_group", credential_profile=credential_profile, + env={}, session_factory=mock_session) + + assert subject is not None + mock_session.assert_called_once_with(region_name=ANY, profile_name=credential_profile) + + class TestWhenCredentialProfileIsInEnvironment: + def test_it_creates_a_profiler_and_it_ignores_credential_profile_set_in_env(self): + credential_profile = "test-profile" + previous_credential_value = os.environ.get(CREDENTIAL_PATH) + os.environ[CREDENTIAL_PATH] = credential_profile + mock_session = MagicMock(name="profiler", spec=boto3.session.Session) + subject = build_profiler(pg_name="my_profiling_group", env={}, session_factory=mock_session) + if previous_credential_value is not None: + os.environ[CREDENTIAL_PATH] = previous_credential_value + else: + del os.environ[CREDENTIAL_PATH] + + 
assert subject is not None + mock_session.assert_called_once_with(region_name=ANY, profile_name=None) + + class TestWhenProfilingGroupArnIsInEnvironment: + def test_it_creates_a_profiler_and_uses_correct_name_and_region(self): + env = {PG_ARN_ENV: 'arn:aws:codeguru-profiler:us-west-2:003713371902:profilingGroup/name_from_arn'} + profiler = build_profiler(env=env) + assert profiler is not None + assert profiler._profiler_runner.collector.reporter.profiling_group_name == 'name_from_arn' + assert 'us-west-2' in profiler._profiler_runner.collector.reporter.codeguru_client_builder.codeguru_client._endpoint.host + + def test_it_creates_a_profiler_and_uses_correct_region_from_arn(self): + env = {PG_ARN_ENV: 'arn:aws:codeguru-profiler:us-west-2:003713371902:profilingGroup/name_from_arn', + REGION_ENV: 'eu-north-1'} + profiler = build_profiler(env=env) + assert profiler is not None + assert 'us-west-2' in profiler._profiler_runner.collector.reporter.codeguru_client_builder.codeguru_client._endpoint.host + + class TestWhenProfilingGroupNameIsAlsoProvided: + def test_it_uses_name_from_arn(self): + env = {PG_ARN_ENV: 'arn:aws:codeguru-profiler:us-west-2:003713371902:profilingGroup/name_from_arn', + PG_NAME_ENV: 'different_name_here'} + assert build_profiler(env=env) \ + ._profiler_runner.collector.reporter.profiling_group_name == 'name_from_arn' + + class TestWhenProfilingGroupArnIsMalformed: + def test_it_creates_profiler_using_the_name_and_region_from_env(self): + env = {PG_ARN_ENV: 'this_arn_is_definitely_not_correct', + PG_NAME_ENV: 'but_this_name_should_do', + REGION_ENV: 'eu-north-1'} + profiler = build_profiler(env=env) + assert profiler._profiler_runner.collector.reporter.profiling_group_name == 'but_this_name_should_do' + assert 'eu-north-1' in profiler._profiler_runner.collector.reporter.codeguru_client_builder.codeguru_client._endpoint.host + + class TestWhenRegionNameIsInEnvironment: + def test_it_creates_a_profiler_and_uses_correct_region(self): + region = "eu-north-1" + env = {REGION_ENV: region} + profiler_factory = MagicMock(spec=Profiler) + subject = build_profiler(pg_name="my_profiling_group", env=env, + profiler_factory=profiler_factory) + assert subject is not None + profiler_factory.assert_called_once_with(profiling_group_name=ANY, region_name=region, + aws_session=ANY, environment_override=ANY) + + class TestWhenSamplingIntervalIsInEnvironment: + def test_it_uses_correct_sampling_interval(self): + env = {"AWS_CODEGURU_PROFILER_SAMPLING_INTERVAL_MS": "100"} + profiler_factory = MagicMock(spec=Profiler) + subject = build_profiler(pg_name="my_profiling_group", env=env, + profiler_factory=profiler_factory) + + profiler_factory.assert_called_once_with(profiling_group_name=ANY, region_name=ANY, aws_session=ANY, + environment_override={"sampling_interval": datetime.timedelta( + milliseconds=100)}) + + class TestWhenReportingIntervalIsInEnvironment: + def test_it_uses_correct_sampling_interval(self): + env = {"AWS_CODEGURU_PROFILER_REPORTING_INTERVAL_MS": "60000"} + profiler_factory = MagicMock(spec=Profiler) + subject = build_profiler(pg_name="my_profiling_group", env=env, + profiler_factory=profiler_factory) + profiler_factory.assert_called_once_with(profiling_group_name=ANY, region_name=ANY, aws_session=ANY, + environment_override={ + "reporting_interval": datetime.timedelta(seconds=60)}) + + class TestWhenMalformedReportingIntervalIsInEnvironment: + def test_it_still_creates_a_profiler(self): + env = {"AWS_CODEGURU_PROFILER_REPORTING_INTERVAL_MS": "12.5"} + profiler_factory = 
MagicMock(spec=Profiler) + subject = build_profiler(pg_name="my_profiling_group", env=env, + profiler_factory=profiler_factory) + assert subject is not None + + class TestWhenProfilingIsDisabled: + def test_it_returns_none(self): + for value in ["False", "false", "0", "Nope", "Yes", "anything"]: + env = {ENABLED_ENV: value} + assert build_profiler(pg_name="my_profiling_group", env=env) is None + + class TestWhenProfilingIsEnabled: + def test_it_builds_the_profiler(self): + for value in ["True", "true"]: + env = {ENABLED_ENV: value} + assert build_profiler(pg_name="my_profiling_group", env=env) is not None + + class TestWhenUnknownExceptionIsThrown: + def test_it_returns_none(self): + assert build_profiler(pg_name="my_profiling_group", env="not_a_dictionary") is None diff --git a/test/unit/test_profiler_disabler.py b/test/unit/test_profiler_disabler.py new file mode 100644 index 0000000..638aa68 --- /dev/null +++ b/test/unit/test_profiler_disabler.py @@ -0,0 +1,247 @@ +import tempfile +from pathlib import Path +import shutil +from mock import Mock, patch +import time + +from codeguru_profiler_agent.model.profile import Profile +from test.pytestutils import before +from datetime import timedelta + +from codeguru_profiler_agent.profiler import DEFAULT_MEMORY_LIMIT_BYTES, DEFAULT_CPU_LIMIT_PERCENTAGE +from codeguru_profiler_agent.profiler_disabler import KillSwitch, CpuUsageCheck, ProfilerDisabler, \ + MINIMUM_SAMPLES_IN_PROFILE +from codeguru_profiler_agent.metrics.timer import Timer +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration + + +def set_agent_config(sampling_interval_seconds=1, cpu_limit_percentage=DEFAULT_CPU_LIMIT_PERCENTAGE): + """ + Reporting interval needs to have a minimum value, and cpu_limit_percentage is used later in these tests; + the other values can be None as we are not using them here. 
+ """ + return AgentConfiguration.set( + AgentConfiguration(sampling_interval=timedelta(seconds=sampling_interval_seconds), + reporting_interval=timedelta(seconds=300), + minimum_time_reporting=timedelta(seconds=60), + cpu_limit_percentage=cpu_limit_percentage)) + + +def assert_config_sampling_interval_used(process_duration_check, profile): + assert process_duration_check.is_cpu_usage_limit_reached(profile) + + set_agent_config(sampling_interval_seconds=42, cpu_limit_percentage=80) + assert not process_duration_check.is_cpu_usage_limit_reached(profile) + + +class TestProfilerDisabler: + def before(self): + self.timer = Timer() + self.env = { + 'timer': self.timer, + 'killswitch_filepath': 'path_to_my_kill_switch', + 'memory_limit_bytes': DEFAULT_MEMORY_LIMIT_BYTES + } + self.disabler = ProfilerDisabler(self.env) + + +class TestSetupParameters(TestProfilerDisabler): + @before + def before(self): + super().before() + + def test_it_sets_all_parameters(self): + self.env['memory_limit_bytes'] = 42 + self.disabler = ProfilerDisabler(self.env) + assert self.disabler.memory_limit_bytes == 42 + assert self.disabler.killswitch.killswitch_filepath == 'path_to_my_kill_switch' + assert self.disabler.cpu_usage_check.timer == self.timer + assert AgentConfiguration.get().cpu_limit_percentage == DEFAULT_CPU_LIMIT_PERCENTAGE + + +class TestWhenAnyFails(TestProfilerDisabler): + @before + def before(self): + super().before() + self.profiler = Mock() + self.disabler.killswitch = Mock() + self.disabler.cpu_usage_check = Mock() + self.disabler._is_memory_limit_reached = Mock(return_value=False) + self.disabler.killswitch.is_killswitch_on = Mock(return_value=False) + self.disabler.killswitch.is_process_duration_limit_reached = Mock(return_value=False) + + def test_it_stops_profiling_if_killswitch_is_on(self): + self.disabler.killswitch.is_killswitch_on = Mock(return_value=True) + assert self.disabler.should_stop_profiling(self.profiler) + + def test_it_stops_profiling_if_memory_limit_is_reached(self): + self.disabler._is_memory_limit_reached = Mock(return_value=True) + assert self.disabler.should_stop_profiling(self.profiler) + + def test_it_stops_profiling_if_process_duration_is_reached(self): + self.disabler.cpu_usage_check.is_cpu_usage_limit_reached = Mock(return_value=True) + assert self.disabler.should_stop_profiling(self.profiler) + + +class TestKillSwitch: + class TestWhenKillSwitchFileExists: + @before + def before(self): + self.temporary_directory = tempfile.mkdtemp() + self.temp_filepath = str( + Path(self.temporary_directory, + 'test_killswitch_unit_test')) + Path(self.temp_filepath).touch() + yield + shutil.rmtree(self.temporary_directory) + + def test_it_returns_true(self): + assert KillSwitch(self.temp_filepath).is_killswitch_on() + + class TestWhenKillSwitchFileDoesNotExists: + def test_it_returns_false(self): + assert not KillSwitch("this_file_does_not_exists_in_the_file_system_or_i_am_unlucky").is_killswitch_on() + + +class TestWhenKillSwitchFileIsAdded: + @before + def before(self): + self.temporary_directory = tempfile.mkdtemp() + self.temp_filepath = str( + Path(self.temporary_directory, + 'test_killswitch_unit_test')) + self.current_time = time.time() + self.killswitch = KillSwitch(self.temp_filepath, clock=lambda: self.current_time) + self.killswitch.is_killswitch_on() + Path(self.temp_filepath).touch() + yield + shutil.rmtree(self.temporary_directory) + + def test_it_still_returns_false_for_less_than_a_minute(self): + self.current_time = self.current_time + 10 + assert not 
self.killswitch.is_killswitch_on() + + def test_it_returns_true_after_a_minute(self): + self.current_time = self.current_time + 61 + assert self.killswitch.is_killswitch_on() + + +class TestWhenKillSwitchFileIsRemoved: + @before + def before(self): + self.temporary_directory = tempfile.mkdtemp() + self.temp_filepath = str( + Path(self.temporary_directory, + 'test_killswitch_unit_test')) + self.current_time = time.time() + self.killswitch = KillSwitch(self.temp_filepath, clock=lambda: self.current_time) + Path(self.temp_filepath).touch() + self.killswitch.is_killswitch_on() + shutil.rmtree(self.temporary_directory) + yield + + def test_it_returns_false_after_a_minute(self): + self.current_time = self.current_time + 61 + assert not self.killswitch.is_killswitch_on() + + +class TestCpuUsageCheck: + def before(self): + self.timer = Timer() + self.profile = Mock(spec=Profile) + for i in range(20): + self.timer.record('runProfiler', 0.5) + set_agent_config(sampling_interval_seconds=1, cpu_limit_percentage=10) + self.process_duration_check = CpuUsageCheck(self.timer) + + +class TestGetAverageSamplingIntervalSeconds(TestCpuUsageCheck): + @before + def before(self): + super().before() + set_agent_config(sampling_interval_seconds=23) + self.profile.get_active_millis_since_start = Mock(return_value=40000) + self.profile.total_sample_count = 10 + + def test_it_returns_average_sampling_interval_seconds_based_on_profile(self): + # sampling interval seconds = (40000/10)/1000 = 4 + assert CpuUsageCheck._get_average_sampling_interval_seconds(self.profile) == 4 + + def test_when_profile_is_none_it_returns_configuration_sampling_interval(self): + assert CpuUsageCheck._get_average_sampling_interval_seconds(None) == 23 + + def test_when_profiler_sample_count_less_than_min_samples_in_profile_it_returns_configuration_sampling_interval( + self): + self.profile.total_sample_count = MINIMUM_SAMPLES_IN_PROFILE - 1 + assert CpuUsageCheck._get_average_sampling_interval_seconds(self.profile) == 23 + + +class TestIsCpuUsageLimitReached(TestCpuUsageCheck): + @before + def before(self): + super().before() + with patch( + "codeguru_profiler_agent.profiler_disabler.CpuUsageCheck._get_average_sampling_interval_seconds", + return_value=4) as m: + self.get_average_sampling_interval_mock = m + yield + + def test_it_calls_get_average_sampling_interval_with_profile(self): + self.process_duration_check.is_cpu_usage_limit_reached(self.profile) + self.get_average_sampling_interval_mock.assert_called_once_with(self.profile) + + def test_when_average_duration_exceeds_limit_it_returns_true(self): + # timer: (0.5/4) * 100= 12.5% + assert self.process_duration_check.is_cpu_usage_limit_reached() + + def test_when_average_duragtion_is_below_limit_it_returns_false(self): + # timer: (0.5/4) * 100= 12.5% + set_agent_config(cpu_limit_percentage=13) + assert not self.process_duration_check.is_cpu_usage_limit_reached() + + def test_when_profile_is_none_it_calls_get_average_sampling_interval_without_profile(self): + self.process_duration_check.is_cpu_usage_limit_reached() + self.get_average_sampling_interval_mock.assert_called_once_with(None) + + +class TestWhenTimerDoesNotHaveTheKey(TestCpuUsageCheck): + @before + def before(self): + super().before() + + def test_it_returns_false(self): + self.process_duration_check.timer = Timer() + assert not self.process_duration_check.is_cpu_usage_limit_reached() + + +class TestWhenTimerDoesNotHaveEnoughMeasures(TestCpuUsageCheck): + @before + def before(self): + super().before() + + def 
test_it_returns_false(self): + self.timer.reset() + for i in range(4): + self.timer.record('runProfiler', 0.5) + assert not self.process_duration_check.is_cpu_usage_limit_reached() + + +class TestMemoryLimitCheck: + @before + def before(self): + self.env = { + 'timer': None, + 'killswitch_filepath': '', + 'memory_limit_bytes': 10 * 1024 * 1024 + } + self.disabler = ProfilerDisabler(self.env) + self.profile = Mock() + self.profile.get_memory_usage_bytes = Mock(return_value=10 * 1024 * 1024 + 1) + self.disabler = ProfilerDisabler(self.env) + + def test_when_memory_usage_exceeds_limit_it_returns_true(self): + assert self.disabler._is_memory_limit_reached(self.profile) + + def test_when_memory_usage_is_below_limit_it_returns_false(self): + self.profile.get_memory_usage_bytes = Mock(return_value=10 * 1024 * 1024 - 1) + assert not self.disabler._is_memory_limit_reached(self.profile) diff --git a/test/unit/test_profiler_runner.py b/test/unit/test_profiler_runner.py new file mode 100644 index 0000000..f63d883 --- /dev/null +++ b/test/unit/test_profiler_runner.py @@ -0,0 +1,102 @@ +from datetime import timedelta +import time +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration +from test.pytestutils import before +from mock import MagicMock + +from codeguru_profiler_agent.profiler_runner import ProfilerRunner +from codeguru_profiler_agent.profiler_disabler import ProfilerDisabler +from codeguru_profiler_agent.local_aggregator import LocalAggregator +from codeguru_profiler_agent.sampler import Sampler + + +class TestProfilerRunner: + @before + def before(self): + self.mock_collector = MagicMock(name="collector", spec=LocalAggregator) + self.mock_disabler = MagicMock(name="profile", spec=ProfilerDisabler) + self.mock_disabler.should_stop_profiling.return_value = False + self.mock_sampler = MagicMock(name="sampler", spec=Sampler) + + self.environment = { + "collector": self.mock_collector, + "profiler_disabler": self.mock_disabler, + "sampler": self.mock_sampler, + "initial_sampling_interval": timedelta(), + "profiler_thread_name": "codeguru-profiler-agent-TestProfilerRunner" + } + + self.agent_configuration = AgentConfiguration(should_profile=True, + sampling_interval=timedelta(seconds=2), + reporting_interval=timedelta(seconds=100)) + + # mock the collector's refresh_configuration function to actual set agent configuration singleton + def set_new_configuration(): + AgentConfiguration.set(self.agent_configuration) + + self.mock_collector.refresh_configuration.side_effect = set_new_configuration + + # mock the collector's flush function to return True or False according to self.is_time_to_report + self.is_time_to_report = False + self.mock_collector.flush.side_effect = lambda *args, **kwargs: self.is_time_to_report + self.mock_collector.profile = None # we need this as we pass the profile object to the disabler, None is fine + self.profiler_runner = ProfilerRunner(self.environment) + + yield + self.profiler_runner.stop() + + def test_when_runner_executes_for_the_first_time(self): + self.profiler_runner._profiling_command() + self.mock_collector.refresh_configuration.assert_called_once() + self.mock_collector.add.assert_called_once() + self.mock_collector.setup.assert_called_once() + + def test_when_runner_executes_for_the_second_time(self): + self.profiler_runner._profiling_command() + self.mock_collector.reset_mock() + + self.profiler_runner._profiling_command() + + self.mock_collector.add.assert_called_once() + 
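# the configuration was refreshed on the first run already; it is refreshed again only after a report goes out +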
self.mock_collector.refresh_configuration.assert_not_called() + self.mock_collector.flush.assert_called_once() + self.mock_collector.setup.assert_not_called() + + def test_when_it_is_time_to_report_it_refreshes_configuration_again(self): + self.is_time_to_report = True + # this will call flush which will return True, so we simulate we have reported; + # the next call should refresh the configuration + self.profiler_runner._profiling_command() + self.mock_collector.reset_mock() + + self.profiler_runner._profiling_command() + self.mock_collector.refresh_configuration.assert_called_once() + + def test_when_disabler_says_to_stop(self): + self.mock_disabler.should_stop_profiling.return_value = True + self.profiler_runner._profiling_command() + + self.mock_collector.refresh_configuration.assert_not_called() + self.mock_collector.add.assert_not_called() + + def test_when_orchestrator_says_no_to_profiler(self): + self.agent_configuration = AgentConfiguration(should_profile=False, + sampling_interval=timedelta(seconds=2), + reporting_interval=timedelta(seconds=150)) + # calling start will start the scheduler; because the initial delay is 0 it executes immediately + self.profiler_runner.start() + # still, it is safer to wait until the new config has been applied + wait_until(lambda: not AgentConfiguration.get().should_profile) + + assert self.profiler_runner.scheduler._get_next_delay_seconds() == 150 + self.mock_collector.add.assert_not_called() + + +def wait_until(predicate, max_wait_seconds=1, period_seconds=0.1): + start = time.time() + timeout = start + max_wait_seconds + while time.time() < timeout: + if predicate(): + return True + time.sleep(period_seconds) + raise AssertionError("Predicate was never true after waiting for " + str(max_wait_seconds) + " seconds") diff --git a/test/unit/test_sampler.py b/test/unit/test_sampler.py new file mode 100644 index 0000000..9a8d41f --- /dev/null +++ b/test/unit/test_sampler.py @@ -0,0 +1,144 @@ +from codeguru_profiler_agent.reporter.agent_configuration import AgentConfiguration +from test.pytestutils import before +import mock +from mock import create_autospec, MagicMock, ANY + +from codeguru_profiler_agent.sampler import Sampler +from codeguru_profiler_agent.sampling_utils import get_stacks + + +class TestSampler: + def before(self): + AgentConfiguration.set(AgentConfiguration._get_new_config(configure_agent_response={ + "agentParameters": { + "SamplingIntervalInMilliseconds": "2700", + "MinimumTimeForReportingInMilliseconds": "60000", + "MaxStackDepth": "1000" + }, + "periodInSeconds": 123 + })) + self.mock_get_stacks = create_autospec(get_stacks) + self.mock_thread_lister = MagicMock(name="thread_lister") + self._current_frames_reply = { + "fake_thread_1": "fake_thread_frames_1", + "fake_thread_2": "fake_thread_frames_2" + } + self.mock_thread_lister._current_frames = \ + MagicMock(name="current_frames_list", return_value=self._current_frames_reply) + self.environment = { + "get_stacks": self.mock_get_stacks, + "thread_lister": self.mock_thread_lister, + } + + +class TestSample(TestSampler): + @before + def before(self): + super().before() + + def test_it_includes_the_number_of_threads_it_attempted_to_sample_and_how_many_in_total_were_seen_in_the_system( + self): + self.mock_get_stacks.return_value = [["dummy_stack_sample"]] + sampler = Sampler(environment=self.environment) + + result = sampler.sample() + + assert (result.attempted_sample_threads_count == 2) + assert (result.seen_threads_count == 2) + + def test_it_calls_the_get_stacks_method_with_the_current_threads_and_the_default_excluded_threads_and_default_max_depth( + self): + default_excluded_threads = set() + default_max_depth = 1000 + + sampler = Sampler(environment=self.environment) + + sampler.sample() + + self.mock_get_stacks.assert_called_once_with( + threads_to_sample=self._current_frames_reply.items(), + excluded_threads=default_excluded_threads, + max_depth=default_max_depth, + ) + + +class TestWhenThereAreMoreThreadsThanMaxThreads(TestSampler): + @before + def before(self): + super().before() + self.environment["max_threads"] = 1 + self.subject = Sampler(environment=self.environment) + + def test_it_calls_the_get_stacks_method_with_a_subset_of_the_threads( + self): + self.subject.sample() + + allowed_results = [ + mock.call( + threads_to_sample=[("fake_thread_1", + "fake_thread_frames_1")], + excluded_threads=ANY, + max_depth=ANY, + ), + mock.call( + threads_to_sample=[("fake_thread_2", + "fake_thread_frames_2")], + excluded_threads=ANY, + max_depth=ANY, + ), + ] + + assert (self.mock_get_stacks.call_args in allowed_results) + + def test_it_includes_the_number_of_threads_it_attempted_to_sample_and_how_many_in_total_were_seen_in_the_system( + self): + self.mock_get_stacks.return_value = [["dummy_stack_sample"]] + + result = self.subject.sample() + + assert (result.attempted_sample_threads_count == 1) + assert (result.seen_threads_count == 2) + + +class TestWhenACustomStackDepthLimitIsSpecified(TestSampler): + @before + def before(self): + super().before() + + def test_it_calls_the_get_stacks_method_with_the_custom_max_stack_depth( + self): + AgentConfiguration.set(AgentConfiguration._get_new_config(configure_agent_response={ + "agentParameters": { + "SamplingIntervalInMilliseconds": "2700", + "MinimumTimeForReportingInMilliseconds": "60000", + "MaxStackDepth": "10" + }, + "periodInSeconds": 123 + })) + sampler = Sampler(environment=self.environment) + + sampler.sample() + + self.mock_get_stacks.assert_called_once_with( + threads_to_sample=ANY, + excluded_threads=ANY, + max_depth=10, + ) + + +class TestWhenExcludedThreadsAreSpecified(TestSampler): + @before + def before(self): + super().before() + + def test_it_calls_the_get_stacks_method_with_the_custom_excluded_threads_list( + self): + self.environment["excluded_threads"] = {"exclude_me"} + sampler = Sampler(environment=self.environment) + + sampler.sample() + + self.mock_get_stacks.assert_called_once_with( + threads_to_sample=ANY, + excluded_threads={"exclude_me"}, + max_depth=ANY) diff --git a/test/unit/test_sampling_utils.py b/test/unit/test_sampling_utils.py new file mode 100644 index 0000000..015dcfc --- /dev/null +++ b/test/unit/test_sampling_utils.py @@ -0,0 +1,90 @@ +import pytest +from mock import mock +import sys + +from test import help_utils + +from codeguru_profiler_agent.sampling_utils import get_stacks + +# name of the synthetic frame that sampling_utils appends when it truncates a stack +DEFAULT_TRUNCATED_FRAME_NAME = "<truncated>" + + +def is_frame_in_stacks(stacks, target_frame): + for stack in stacks: + for frame in stack: + if target_frame in frame.name: + return True + return False + + +def assert_frames_in_stack_are_in_expected_order(stacks, parent_frame, child_frame): + for stack in stacks: + for i in range(len(stack)): + if parent_frame in stack[i].name: + assert child_frame in stack[i + 1].name + return + assert False + + +class TestSamplingUtils: + class TestGetStacks: + @pytest.fixture(autouse=True) + def around(self): + self.helper = help_utils.HelperThreadRunner() + self.helper.new_helper_thread_blocked_inside_dummy_method() + yield + 
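# teardown: stop the helper thread so it does not leak into other tests +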
self.helper.stop_helper_thread() + + def test_it_adds_time_sleep_frame(self): + # Run a sleeping thread for 1 second so get_stacks should capture it; it will not block the test and it + # dies after a second. The exact synthetic frame name is implementation-defined, so we match on the + # "sleep" substring to keep this check robust. + self.helper.new_helper_sleep_thread(sleep_sec=1) + stacks = get_stacks( + threads_to_sample=sys._current_frames().items(), + excluded_threads=set(), + max_depth=100) + + assert is_frame_in_stacks(stacks, target_frame="sleep") + + def test_it_returns_a_list_of_lists_containing_stack_frames(self): + stacks = get_stacks( + threads_to_sample=sys._current_frames().items(), + excluded_threads=set(), + max_depth=100) + + assert_frames_in_stack_are_in_expected_order( + stacks, "dummy_parent_method", + "dummy_method") + + def test_it_truncates_stacks_deeper_than_max_depth(self): + stacks = get_stacks( + threads_to_sample=sys._current_frames().items(), + excluded_threads=set(), + max_depth=2) + + for stack in stacks: + assert len(stack) == 2 + assert stack[-1].name == DEFAULT_TRUNCATED_FRAME_NAME + + def test_it_excludes_the_given_threads_from_the_output(self): + stacks = get_stacks( + threads_to_sample=sys._current_frames().items(), + excluded_threads=set(["test-thread"]), + max_depth=100) + + assert not is_frame_in_stacks(stacks, "dummy_parent_method") + + def test_it_does_not_include_zombie_threads(self): + with mock.patch( + "codeguru_profiler_agent.sampling_utils._is_zombie", + side_effect=lambda thread: thread.name == "test-thread" + ): + + stacks = get_stacks( + threads_to_sample=sys._current_frames().items(), + excluded_threads=set(), + max_depth=100) + + assert not is_frame_in_stacks( + stacks, "dummy_parent_method") diff --git a/test/unit/utils/test_execution_state.py b/test/unit/utils/test_execution_state.py new file mode 100644 index 0000000..886135d --- /dev/null +++ b/test/unit/utils/test_execution_state.py @@ -0,0 +1,319 @@ +import threading +import time +import datetime +from test.pytestutils import before +from queue import Queue, Empty +from mock import MagicMock, call + +from codeguru_profiler_agent.utils.execution_state import ExecutionState + +# this is to make sure the unit test does not run forever if there is a bug in the scheduler +TEST_TIMEOUT_SECONDS = 5 + + +class ExecutorForTest: + def __init__(self, execution_state): + self.execution_state = execution_state + self.thread = \ + threading.Thread(target=self.execute, name="from_test_execution_state") + self.execution_counter = 0 + self.timeout = TEST_TIMEOUT_SECONDS + self.start_time = time.time() + self.thread.start() + + def execute(self): + while (not self.execution_state.is_stopped() + and time.time() < self.start_time + self.timeout): + self.execution_state.wait_for_next_tick_or_stop() + if not self.execution_state.is_stopped(): + self.execution_counter += 1 + + +class TestPause: + @before + def before(self): + self.state = ExecutionState() + self.executor = ExecutorForTest(self.state) + yield + self.state.signal_stop() + + def test_is_paused(self): + self.state.signal_pause(block=True) + assert self.state.is_paused() + + def test_when_resumed_it_continues_to_run(self): + self.state.signal_resume(block=True) + assert not self.state.is_paused() + assert not self.state.is_stopped() + + def test_when_it_is_paused_it_resumes(self): + self.state.signal_pause(block=True) + + self.state.signal_resume(block=True) + assert not self.state.is_paused() + assert not self.state.is_stopped() + + def test_it_resumes_after_pausing_several_times(self): + self.state.signal_pause(block=True) + 
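# pausing again while already paused should be a harmless no-op +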
self.state.signal_pause(block=True) + + self.state.signal_resume(block=True) + assert not self.state.is_paused() + assert not self.state.is_stopped() + + +class TestStop: + @before + def before(self): + self.state = ExecutionState() + self.executor = ExecutorForTest(self.state) + + def test_is_stopped(self): + self.state.signal_stop(block=True) + assert self.state.is_stopped() + + +class TestDelay: + def before(self): + self.initial_delay = datetime.timedelta(seconds=1) + self.default_delay = datetime.timedelta(seconds=2) + self.mock_queue = MagicMock(name="queue", spec=Queue) + self.state = ExecutionState( + delay_provider=lambda: self.default_delay, + initial_delay=self.initial_delay, + state_changes_queue=self.mock_queue, + clock=lambda: 1) + # mock queue so that we instantly return Empty without waiting for timeout + self.mock_queue.get.side_effect = Empty + + +class TestDelayFirstExecution(TestDelay): + @before + def before(self): + super().before() + + def test_we_waited_for_initial_delay(self): + is_time_to_execute = self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.assert_called_once_with( + block=True, timeout=self.initial_delay.total_seconds()) + assert is_time_to_execute + + def test_when_delay_changed_after_first_execution_we_then_waited_for_default_delay(self): + self.default_delay = datetime.timedelta(milliseconds=2700) + self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.reset_mock() + is_time_to_execute = self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.assert_called_once_with(block=True, timeout=2.7) + assert is_time_to_execute + + +class TestDelayChangedDuringAnExecution(TestDelay): + @before + def before(self): + super().before() + self.default_delay = datetime.timedelta(milliseconds=2700) + + # wait for first execution, initial_delay has elapsed + self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.reset_mock() + + # wait for second execution, normal delay has elapsed + self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.reset_mock() + + # now delay changes (e.g. new configuration from orchestrator) + self.default_delay = datetime.timedelta(milliseconds=1234) + + def test_we_then_waited_for_new_delay(self): + is_time_to_execute = self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.assert_called_once_with(block=True, timeout=1.234) + assert is_time_to_execute + + +class TestDelayProviderChangedDuringAnExecution(TestDelay): + @before + def before(self): + super().before() + self.default_delay = datetime.timedelta(milliseconds=2700) + + # wait for first execution, initial_delay has elapsed + self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.reset_mock() + + # wait for second execution, normal delay has elapsed + self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.reset_mock() + + # now delay provider changes (e.g. 
profiler_runner changes the scheduler config) + self.state.delay_provider = lambda: datetime.timedelta(milliseconds=1234) + + def test_we_then_waited_for_new_delay(self): + is_time_to_execute = self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.assert_called_once_with(block=True, timeout=1.234) + assert is_time_to_execute + + +class TestPausedTime: + @before + def before(self): + self.initial_delay = datetime.timedelta(seconds=10) + self.default_delay = datetime.timedelta(seconds=20) + self.first_pause_time = 3 + self.first_resume_time = 8 + self.mock_queue = MagicMock(name="queue", spec=Queue) + self.mock_clock = MagicMock(name="clock", spec=time.time) + self.mock_clock.side_effect = [ + 0, self.first_pause_time, self.first_resume_time + ] + self.state = ExecutionState( + delay_provider=lambda: self.default_delay, + initial_delay=self.initial_delay, + state_changes_queue=self.mock_queue, + clock=self.mock_clock) + # mock queue so that we simulate a pause command at 3 o'clock + # then a resume command at 8 o'clock and then no change. + # With initial delay being 10, we should wait for 7 more after resume. + self.mock_queue.get.side_effect = \ + [ + { + "state": ExecutionState.PAUSED, + "current_time": self.first_pause_time + }, + { + "state": ExecutionState.RUNNING, + "current_time": self.first_resume_time + }, + Empty + ] + + def test_when_paused_during_initial_delay_we_waited_for_remaining_time_after_resume(self): + is_time_to_execute = self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.assert_called_with( + block=True, + timeout=7) + assert is_time_to_execute + + def test_when_paused_again_after_first_execution_we_waited_for_correct_remaining_time_after_second_resume(self): + self.initial_execution_time = 15 + self.second_pause_time = 19 + self.second_resume_time = 23 + self.mock_queue.get.side_effect = \ + [ + { + "state": ExecutionState.PAUSED, + "current_time": self.first_pause_time + }, + { + "state": ExecutionState.RUNNING, + "current_time": self.first_resume_time + }, + Empty, + { + "state": ExecutionState.PAUSED, + "current_time": self.second_pause_time + }, + { + "state": ExecutionState.RUNNING, + "current_time": self.second_resume_time + }, + Empty + ] + self.mock_clock.side_effect = [ + 0, self.first_pause_time, self.first_resume_time, self.initial_execution_time, + self.second_pause_time, self.second_resume_time + ] + + # first wait until we execute at time 15s + self.state.wait_for_next_tick_or_stop() + + # second call should wait for 20s for normal delay, then wait for 15s remaining after resume + is_time_to_execute = self.state.wait_for_next_tick_or_stop() + self.mock_queue.get.assert_has_calls([ + call(block=True, timeout=10), # wanted to wait for 10s, the initial delay + call(block=True), # after pause at 3s, wait until next resume + call(block=True, timeout=7), # after resume at 8s wait for 7s remaining from initial delay + call(block=True, timeout=20), # at 15s, wanted to wait for 20s, the normal delay + call(block=True), # after pause at 19s, wait until next resume + call(block=True, timeout=16)]) # after resume at 23s wait for 6s remaining from initial delay + assert is_time_to_execute + + def test_when_paused_multiple_times_in_initial_delay_the_wait_time_accumulates(self): + self.first_pause_time = 3 + self.first_resume_time = 8 + self.second_pause_time = 9 + self.second_resume_time = 10 + self.mock_queue.get.side_effect = \ + [ + { + "state": ExecutionState.PAUSED, + "current_time": self.first_pause_time + }, + { + "state": 
+
+    def test_when_paused_multiple_times_in_initial_delay_the_wait_time_accumulates(self):
+        self.first_pause_time = 3
+        self.first_resume_time = 8
+        self.second_pause_time = 9
+        self.second_resume_time = 10
+        self.mock_queue.get.side_effect = \
+            [
+                {
+                    "state": ExecutionState.PAUSED,
+                    "current_time": self.first_pause_time
+                },
+                {
+                    "state": ExecutionState.RUNNING,
+                    "current_time": self.first_resume_time
+                },
+                {
+                    "state": ExecutionState.PAUSED,
+                    "current_time": self.second_pause_time
+                },
+                {
+                    "state": ExecutionState.RUNNING,
+                    "current_time": self.second_resume_time
+                },
+                Empty
+            ]
+        self.mock_clock.side_effect = [
+            0, self.first_pause_time, self.first_resume_time, self.second_pause_time, self.second_resume_time
+        ]
+
+        is_time_to_execute = self.state.wait_for_next_tick_or_stop()
+        self.mock_queue.get.assert_has_calls([
+            call(block=True, timeout=10),   # wanted to wait for 10s, the initial delay
+            call(block=True),               # after pause at 3s, wait until the next resume
+            call(block=True, timeout=7),    # after resume at 8s, wait the 7s remaining from the initial delay
+            call(block=True),               # after pause at 9s, wait until the next resume
+            call(block=True, timeout=6)])   # after resume at 10s, wait the 6s remaining from the initial delay
+        assert is_time_to_execute
+
+    def test_when_paused_multiple_times_in_normal_delay_the_wait_time_accumulates(self):
+        self.first_pause_time = 13
+        self.first_resume_time = 18
+        self.second_pause_time = 19
+        self.second_resume_time = 20
+        self.mock_queue.get.side_effect = \
+            [
+                Empty,
+                {
+                    "state": ExecutionState.PAUSED,
+                    "current_time": self.first_pause_time
+                },
+                {
+                    "state": ExecutionState.RUNNING,
+                    "current_time": self.first_resume_time
+                },
+                {
+                    "state": ExecutionState.PAUSED,
+                    "current_time": self.second_pause_time
+                },
+                {
+                    "state": ExecutionState.RUNNING,
+                    "current_time": self.second_resume_time
+                },
+                Empty
+            ]
+        self.mock_clock.side_effect = [
+            0, 10, self.first_pause_time, self.first_resume_time, self.second_pause_time, self.second_resume_time
+        ]
+
+        # first wait until we execute after 10s, the initial delay
+        self.state.wait_for_next_tick_or_stop()
+        # then wait again for the normal delay and receive multiple pause and resume commands
+        is_time_to_execute = self.state.wait_for_next_tick_or_stop()
+        self.mock_queue.get.assert_has_calls([
+            call(block=True, timeout=10),   # wait for 10s, the initial delay
+            call(block=True, timeout=20),   # wanted to wait for 20s, the normal delay
+            call(block=True),               # after pause at 13s, wait until the next resume
+            call(block=True, timeout=17),   # after resume at 18s, wait the 17s remaining from the normal delay
+            call(block=True),               # after pause at 19s, wait until the next resume
+            call(block=True, timeout=16)])  # after resume at 20s, wait the 16s remaining from the normal delay
+        assert is_time_to_execute
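Both test files exercise the same underlying mechanism: the waiting thread blocks on state_changes_queue.get(block=True, timeout=...), a queued message delivers a pause/resume/stop command, and the Empty exception raised when the timeout expires is what means "the delay elapsed, time to execute". That is why the mocks above drive everything through get's side_effect. A rough sketch of that loop shape, under the assumption that wait_for_next_tick_or_stop is built this way (the tests only observe the get calls, so this is illustrative):

    from queue import Queue, Empty

    def wait_for_tick(state_changes_queue: Queue, timeout_seconds: float):
        """Illustrative only: wait for a state-change message or a timeout.
        Returns True when the delay fully elapsed (time to execute),
        otherwise the message that interrupted the wait."""
        try:
            return state_changes_queue.get(block=True, timeout=timeout_seconds)
        except Empty:
            return True  # nothing arrived in time: the tick is due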
"test_thread") + + class TestStop: + @before + def before(self): + self.scheduler = Scheduler(lambda: True, thread_name="test_thread") + self.scheduler.start() + # Make sure thread is alive + assert (self.scheduler.is_running()) + + def test_thread_terminates_when_called_stop(self): + self.scheduler.stop() + assert (not self.scheduler.is_running()) + + class TestErrorHandling: + def test_exception_not_propagated_from_scheduled_thread(self): + self.exception_thrown = False + + def throw_exception(): + self.exception_thrown = True + raise Exception("testing") + + scheduler = \ + Scheduler(command=throw_exception, thread_name="test_thread") + scheduler.start() + scheduler._thread.join() + + assert self.exception_thrown + + def test_exception_not_thrown_when_stop_is_called_before_starting(self): + scheduler = Scheduler(lambda: True, thread_name="test_thread") + + scheduler.stop() + + class TestPauseAndResume: + @pytest.fixture(autouse=True) + def around(self): + self.ready_queue = Queue() + self.done_queue = Queue() + + def running_test_process(): + self.ready_queue.put(True) + return self.done_queue.get(TEST_TIMEOUT_SECONDS) + + self.scheduler = \ + Scheduler(running_test_process, thread_name="test_thread") + self.scheduler.start() + self.ready_queue.get(TEST_TIMEOUT_SECONDS) + # finish first execution by pushing into done_queue + # the scheduler should go into wait + self.done_queue.put(True) + + # then pause + self.scheduler.pause(block=True) + yield + self.scheduler.stop() + + def test_pause_when_scheduler_is_paused(self): + assert (self.scheduler.is_running()) + assert (self.scheduler.is_paused()) + + def test_resume_when_scheduler_is_running(self): + self.scheduler.resume(block=True) + assert (self.scheduler.is_running()) + assert (not self.scheduler.is_paused())