From ceb4a4ffa8b461d844ba7742e5b904c378a4e67d Mon Sep 17 00:00:00 2001
From: mukileshPira <mukilesh@amazon.co.uk>
Date: Thu, 8 Jul 2021 00:02:28 +0100
Subject: [PATCH 1/4] Fixing bug in calculating active time since start

In case of lambda where there are pause times, it should be considered when calculating active time. Essentially total pause time should be removed from [end-start]. Earlier we were considering the last sample time to be the end of the profile but in some corner cases where there is huge pause after that we would include it in the calculation even though pause is after the "end" of the profile.

So updating the way we calculate "end" of the profile. For more detailed information, please have a look at the comments.

Also, added some debug logs.
---
 codeguru_profiler_agent/model/profile.py | 42 +++++++++++++++++++++---
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/codeguru_profiler_agent/model/profile.py b/codeguru_profiler_agent/model/profile.py
index 82f5cb3..866eed7 100644
--- a/codeguru_profiler_agent/model/profile.py
+++ b/codeguru_profiler_agent/model/profile.py
@@ -1,4 +1,5 @@
 import time
+import logging
 
 from random import SystemRandom
 from codeguru_profiler_agent.model.call_graph_node import CallGraphNode
@@ -9,6 +10,7 @@
 
 ROOT_NODE_NAME = "ALL"
 
+logger = logging.getLogger(__name__)
 
 class Profile:
     def __init__(self, profiling_group_name, sampling_interval_seconds, host_weight, start, agent_debug_info, clock=time.time):
@@ -56,11 +58,43 @@ def get_active_millis_since_start(self):
         This returns the total "active" wall clock time since start. In AWS lambda the process can be frozen, the
         time while we are frozen should not be counted in here. In an EC2 or other type of host it is simply the wall
         clock time since start.
-        If the "end" time has been set, we give the active time between start and end otherwise between start and now
+
+        Previously self.end was used to calculate active_millis_since_start but self.end is updated when a sample is added
+        so in rare cases where a sample is collected before the last pause time then we might add additional pause time
+        which can lead to incorrect calculation of active time. In worst cases, it can even lead to negative values depending
+        on the pause time.
+
+        Below is an example indicating the potential error of considering self.end in the calculation.
+        ------------------------------------------------------------------
+        |          |        |    |         |    |     |       |     ||||||
+        S          P        R    P         R    SE    P       R     REPORT
+        s1_________|___p1___|____|____p2___|____e1____|___p3__|__________|
+
+        S - Start
+        P - Pause
+        R - Resume
+        SE - self.end (last sample)
+        REPORT - Calculating active time.
+
+        If we consider self.end which is e1 in above case then active time would be e1-s1-(p1+p2+p3). But pause p3 is after e1
+        so that leads to incorrect calculation of active time.
+
+        Ideally we would want to set profile_end to be last sample time and subtract pause times only before that but it requires
+        additional work in maintaining pause time which isn't worth as it makes the logic complex with very little gain.
+
+        So we are setting it to current time and in some corner cases to last_pause time.
+
+        Additional note: last_pause is set to none as part of resume and _paused_ms is added as part of resume. So we won't
+        end up in above state in those corner cases where we select last_pause.
         """
-        end = self.last_pause if self.last_pause is not None else \
-            self._end if self._end is not None else current_milli_time(clock=self._clock)
-        return end - self.start - self._paused_ms
+        end = self.last_pause if self.last_pause is not None else current_milli_time(clock=self._clock)
+        active_time_millis_since_start = end - self.start - self._paused_ms
+        logger.debug(
+            "Active time since start is {activeTime} which is calculated using start: {start}, end: {end}, last_pause: {last_pause}, paused_ms: {paused_ms}, last_resume: {last_resume}"
+            .format(activeTime = active_time_millis_since_start, start = self.start, end = self._end, last_pause = self.last_pause, paused_ms = self._paused_ms, last_resume = self.last_resume)
+        )
+
+        return active_time_millis_since_start
 
     def add(self, sample):
         """

From 9952f7106c28e40dbd5a228ca3df195f4bf2dda5 Mon Sep 17 00:00:00 2001
From: mukileshPira <mukilesh@amazon.co.uk>
Date: Thu, 8 Jul 2021 15:58:14 +0100
Subject: [PATCH 2/4] Fixing unit tests

---
 test/unit/model/test_profile.py                    | 4 ++--
 test/unit/sdk_reporter/test_sdk_profile_encoder.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/unit/model/test_profile.py b/test/unit/model/test_profile.py
index 07c0537..895dd48 100644
--- a/test/unit/model/test_profile.py
+++ b/test/unit/model/test_profile.py
@@ -420,10 +420,10 @@ def before(self):
         self.subject.resume()
         self.turn_clock(seconds=5)
         self.subject.end = self.mock_clock() * 1000
-        self.turn_clock(seconds=1)  # that extra second should not be included in the duration
+        self.turn_clock(seconds=5)
 
     def test_it_returns_duration_up_to_end_time(self):
-        assert self.subject.get_active_millis_since_start() == 14000
+        assert self.subject.get_active_millis_since_start() == 19000
 
 
 class TestSetOverhead(TestProfile):
diff --git a/test/unit/sdk_reporter/test_sdk_profile_encoder.py b/test/unit/sdk_reporter/test_sdk_profile_encoder.py
index 2332a63..cf6df1b 100644
--- a/test/unit/sdk_reporter/test_sdk_profile_encoder.py
+++ b/test/unit/sdk_reporter/test_sdk_profile_encoder.py
@@ -23,9 +23,9 @@
 
 def example_profile():
     start_time = 1514764800000
-    end_time = 1514772000000
+    end_time = 1514771000000
     profile = Profile(profiling_group_name="TestProfilingGroupName", sampling_interval_seconds=1.0, host_weight=2,
-                      start=start_time, agent_debug_info=AgentDebugInfo(errors_metadata), clock=lambda: 1514772000000)
+                      start=start_time, agent_debug_info=AgentDebugInfo(errors_metadata), clock=lambda: 1514772000)
     profile.add(
         Sample(stacks=[[Frame("bottom"), Frame("middle"), Frame("top")],
                        [Frame("bottom"), Frame("middle"), Frame("different_top")],
@@ -89,7 +89,7 @@ def test_it_includes_the_start_time_from_the_profile_in_epoch_millis(
 
     def test_it_includes_the_end_time_from_the_profile_in_epoch_millis(
             self):
-        assert (self.decoded_json_result()["end"] == 1514772000000)
+        assert (self.decoded_json_result()["end"] == 1514771000000)
 
     def test_it_includes_the_agent_info_in_the_agent_metadata(self):
         assert (self.decoded_json_result()["agentMetadata"]["agentInfo"]["type"] ==

From 70569f7a692272df2df36eadc7ca15b80cd7b7a1 Mon Sep 17 00:00:00 2001
From: mukileshPira <mukilesh@amazon.co.uk>
Date: Thu, 8 Jul 2021 17:01:56 +0100
Subject: [PATCH 3/4] Updating comment and minor change in resume

---
 codeguru_profiler_agent/model/profile.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/codeguru_profiler_agent/model/profile.py b/codeguru_profiler_agent/model/profile.py
index 866eed7..a55a040 100644
--- a/codeguru_profiler_agent/model/profile.py
+++ b/codeguru_profiler_agent/model/profile.py
@@ -83,9 +83,6 @@ def get_active_millis_since_start(self):
         additional work in maintaining pause time which isn't worth as it makes the logic complex with very little gain.
 
         So we are setting it to current time and in some corner cases to last_pause time.
-
-        Additional note: last_pause is set to none as part of resume and _paused_ms is added as part of resume. So we won't
-        end up in above state in those corner cases where we select last_pause.
         """
         end = self.last_pause if self.last_pause is not None else current_milli_time(clock=self._clock)
         active_time_millis_since_start = end - self.start - self._paused_ms
@@ -147,8 +144,8 @@ def resume(self):
             # resume gets called when profile is running
             return
         self.last_resume = current_milli_time(clock=self._clock)
-        self._paused_ms += self.last_resume - self.last_pause
         self.last_pause = None
+        self._paused_ms += self.last_resume - self.last_pause
 
     def is_empty(self):
         return self.total_seen_threads_count == 0.0

From 59e25e06265ea7b49416d885439d831ebc761d1b Mon Sep 17 00:00:00 2001
From: mukileshPira <mukilesh@amazon.co.uk>
Date: Thu, 8 Jul 2021 17:06:34 +0100
Subject: [PATCH 4/4] Fixing unit tests for latest changes

---
 codeguru_profiler_agent/model/profile.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/codeguru_profiler_agent/model/profile.py b/codeguru_profiler_agent/model/profile.py
index a55a040..929defc 100644
--- a/codeguru_profiler_agent/model/profile.py
+++ b/codeguru_profiler_agent/model/profile.py
@@ -144,8 +144,9 @@ def resume(self):
             # resume gets called when profile is running
             return
         self.last_resume = current_milli_time(clock=self._clock)
+        prev_last_pause = self.last_pause
         self.last_pause = None
-        self._paused_ms += self.last_resume - self.last_pause
+        self._paused_ms += self.last_resume - prev_last_pause
 
     def is_empty(self):
         return self.total_seen_threads_count == 0.0