Fix dataset timestamps #190

Merged · 12 commits · Jul 19, 2024
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -5,3 +5,4 @@ pytest-env==1.1.3
pytest-cov==5.0.0
pytest-mock==3.14.0
wheel==0.43.0
+time-machine==2.14.1
16 changes: 4 additions & 12 deletions servicelayer/taskqueue.py
@@ -83,7 +83,6 @@ def __init__(self, conn, name):
        self.running_key = make_key(PREFIX, "qdj", name, "running")
        self.pending_key = make_key(PREFIX, "qdj", name, "pending")
        self.start_key = make_key(PREFIX, "qdj", name, "start")
-       self.end_key = make_key(PREFIX, "qdj", name, "end")
        self.last_update_key = make_key(PREFIX, "qdj", name, "last_update")
        self.active_stages_key = make_key(PREFIX, "qds", name, "active_stages")

@@ -115,19 +114,14 @@ def cancel(self):
"""Cancel processing of all tasks belonging to a dataset"""
pipe = self.conn.pipeline()
self.flush_status(pipe)
# What should happen to the end_key in this case?
pipe.delete(self.end_key)
pipe.execute()

    def get_status(self):
        """Status of a given dataset."""
        status = {"finished": 0, "running": 0, "pending": 0, "stages": []}

-       start, end, last_update = self.conn.mget(
-           (self.start_key, self.end_key, self.last_update_key)
-       )
+       start, last_update = self.conn.mget((self.start_key, self.last_update_key))
        status["start_time"] = start
-       status["end_time"] = end
        status["last_update"] = last_update

        for stage in self.conn.smembers(self.active_stages_key):
@@ -226,9 +220,8 @@ def add_task(self, task_id, stage):
        pipe.sadd(self.pending_key, task_id)

        # update dataset timestamps
-       pipe.set(self.start_key, pack_now())
+       pipe.set(self.start_key, pack_now(), nx=True)
        pipe.set(self.last_update_key, pack_now())
-       pipe.delete(self.end_key)
        pipe.execute()

    def remove_task(self, task_id, stage):
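Note on the change above: the fix hinges on the `nx=True` flag. Redis `SET ... NX` writes only when the key is absent, so `start_time` is recorded once per active run of the dataset instead of being overwritten by every enqueued task. A minimal sketch of that behavior, using fakeredis (as the tests below do) and illustrative key names rather than the actual servicelayer keys:

```python
import fakeredis

conn = fakeredis.FakeStrictRedis(decode_responses=True)

# First write wins: with nx=True, SET only succeeds if the key is absent.
conn.set("dataset:start", "2024-01-01T00:00:00", nx=True)
# A later NX write is a no-op because the key already exists.
conn.set("dataset:start", "2024-01-03T00:00:00", nx=True)
assert conn.get("dataset:start") == "2024-01-01T00:00:00"

# Once the key is deleted (as happens when a dataset's status is flushed),
# the next NX write records a fresh start time.
conn.delete("dataset:start")
conn.set("dataset:start", "2024-01-07T00:00:00", nx=True)
assert conn.get("dataset:start") == "2024-01-07T00:00:00"
```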
@@ -279,9 +272,8 @@ def checkout_task(self, task_id, stage):
        pipe.srem(self.pending_key, task_id)

        # update dataset timestamps
-       pipe.set(self.start_key, pack_now())
+       pipe.set(self.start_key, pack_now(), nx=True)
Contributor Author commented:
I’m not entirely sure setting the "start_time" timestamp here is necessary in the first place, as a worker shouldn’t be able to check out a task without add_task having been executed first. (The same probably also applies to pipe.sadd(self.key, self.name).)

Didn’t change it because the primary goal of this PR is to fix the timestamps, not to refactor, but I’d still be interested to understand whether there’s a legitimate case where this is relevant or whether it’s merely a precaution?

Contributor commented:
It seems like some sort of defensive programming approach, which I can't say I dislike :)

        pipe.set(self.last_update_key, pack_now())
-       pipe.delete(self.end_key)
        pipe.execute()

    def mark_done(self, task: Task):
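All of these timestamp updates go through a Redis pipeline: commands are buffered client-side and sent as one batch when `execute()` is called (redis-py wraps the batch in MULTI/EXEC by default, so it applies atomically). A minimal sketch of the pattern, with a fakeredis stand-in and hypothetical key names:

```python
import fakeredis

conn = fakeredis.FakeStrictRedis(decode_responses=True)

pipe = conn.pipeline()
pipe.sadd("dataset:pending", "task-1")
pipe.set("dataset:last_update", "2024-01-01T00:00:00")
# Nothing has been sent to the server yet; execute() flushes the batch.
pipe.execute()

assert conn.smembers("dataset:pending") == {"task-1"}
```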
@@ -309,8 +301,8 @@ def mark_done(self, task: Task):
            pipe.incr(make_key(stage_key, "finished"))

        # update dataset timestamps
-       pipe.set(self.end_key, pack_now())
        pipe.set(self.last_update_key, pack_now())

        pipe.execute()

        if self.is_done():
1 change: 1 addition & 0 deletions setup.py
@@ -51,6 +51,7 @@
"pytest >= 3.6",
"coverage",
"pytest-cov",
"time-machine>=2.14.1, <3.0.0",
],
},
test_suite="tests",
203 changes: 196 additions & 7 deletions tests/test_taskqueue.py
@@ -3,6 +3,7 @@
from unittest.mock import patch
import json
from random import randrange
+import time_machine

import pika
from prometheus_client import REGISTRY
@@ -73,7 +74,6 @@ def test_task_queue(self):
assert status["finished"] == 0, status
assert status["pending"] == 1, status
assert status["running"] == 0, status
assert status["end_time"] is None
started = unpack_datetime(status["start_time"])
last_updated = unpack_datetime(status["last_update"])
assert started < last_updated
@@ -121,10 +121,8 @@ def test_task_queue(self):
assert status["finished"] == 0, status
assert status["pending"] == 0, status
assert status["running"] == 0, status
# started = unpack_datetime(status["start_time"])
# last_updated = unpack_datetime(status["last_update"])
# end_time = unpack_datetime(status["end_time"])
# assert started < end_time < last_updated
assert status["start_time"] is None
assert status["last_update"] is None

@patch("servicelayer.taskqueue.Dataset.should_execute")
def test_task_that_shouldnt_execute(self, mock_should_execute):
@@ -192,6 +190,199 @@ def did_nack():
        assert dataset.is_task_tracked(Task(**body))


def test_dataset_get_status():
    conn = get_fakeredis()
    conn.flushdb()

    dataset = Dataset(conn=conn, name="123")
    status = dataset.get_status()

    assert status["pending"] == 0
    assert status["running"] == 0
    assert status["finished"] == 0
    assert status["start_time"] is None
    assert status["last_update"] is None

    task_one = Task(
        task_id="1",
        job_id="abc",
        delivery_tag="",
        operation="ingest",
        context={},
        payload={},
        priority=5,
        collection_id="1",
    )

    task_two = Task(
        task_id="2",
        job_id="abc",
        delivery_tag="",
        operation="ingest",
        context={},
        payload={},
        priority=5,
        collection_id="1",
    )

    task_three = Task(
        task_id="3",
        job_id="abc",
        delivery_tag="",
        operation="index",
        context={},
        payload={},
        priority=5,
        collection_id="1",
    )

    # Adding a task updates `start_time` and `last_update`
Contributor commented:
I appreciate your comments in this non-trivial test!

with time_machine.travel("2024-01-01T00:00:00"):
dataset.add_task(task_one.task_id, task_one.operation)

status = dataset.get_status()
assert status["pending"] == 1
assert status["running"] == 0
assert status["finished"] == 0
assert status["start_time"].startswith("2024-01-01T00:00:00")
assert status["last_update"].startswith("2024-01-01T00:00:00")

# Once a worker starts processing a task, only `last_update` is updated
with time_machine.travel("2024-01-02T00:00:00"):
dataset.checkout_task(task_one.task_id, task_one.operation)

status = dataset.get_status()
assert status["pending"] == 0
assert status["running"] == 1
assert status["finished"] == 0
assert status["start_time"].startswith("2024-01-01T00:00:00")
assert status["last_update"].startswith("2024-01-02T00:00:00")

# When another task is added, only `last_update` is updated
with time_machine.travel("2024-01-03T00:00:00"):
dataset.add_task(task_two.task_id, task_two.operation)

status = dataset.get_status()
assert status["pending"] == 1
assert status["running"] == 1
assert status["finished"] == 0
assert status["start_time"].startswith("2024-01-01T00:00:00")
assert status["last_update"].startswith("2024-01-03T00:00:00")

# When the first task has been processed, `last_update` is updated
with time_machine.travel("2024-01-04T00:00:00"):
dataset.mark_done(task_one)

status = dataset.get_status()
assert status["pending"] == 1
assert status["running"] == 0
assert status["finished"] == 1
assert status["start_time"].startswith("2024-01-01T00:00:00")
assert status["last_update"].startswith("2024-01-04T00:00:00")

# When the worker starts processing the second task, only `last_update` is updated
with time_machine.travel("2024-01-05T00:00:00"):
dataset.checkout_task(task_two.task_id, task_two.operation)

status = dataset.get_status()
assert status["pending"] == 0
assert status["running"] == 1
assert status["finished"] == 1
assert status["start_time"].startswith("2024-01-01T00:00:00")
assert status["last_update"].startswith("2024-01-05T00:00:00")

# Once all tasks have been processed, status data is flushed
with time_machine.travel("2024-01-06T00:00:00"):
dataset.mark_done(task_two)

status = dataset.get_status()
assert status["pending"] == 0
assert status["running"] == 0
assert status["finished"] == 0
assert status["start_time"] is None
assert status["last_update"] is None

# Adding a new task to an inactive dataset sets `start_time`
with time_machine.travel("2024-01-07T00:00:00"):
dataset.add_task(task_three.task_id, task_three.operation)

status = dataset.get_status()
assert status["pending"] == 1
assert status["running"] == 0
assert status["finished"] == 0
assert status["start_time"].startswith("2024-01-07T00:00:00")
assert status["last_update"].startswith("2024-01-07T00:00:00")

# Cancelling a dataset flushes status data
with time_machine.travel("2024-01-08T00:00:00"):
dataset.checkout_task(task_three.task_id, task_three.operation)
dataset.cancel()

status = dataset.get_status()
assert status["pending"] == 0
assert status["running"] == 0
assert status["finished"] == 0
assert status["start_time"] is None
assert status["last_update"] is None

# Tasks that were already running when the dataset was cancelled
# have no effect
with time_machine.travel("2024-01-09T00:00:00"):
dataset.mark_done(task_three)

assert status["pending"] == 0
assert status["running"] == 0
assert status["finished"] == 0
assert status["start_time"] is None
assert status["last_update"] is None


def test_dataset_cancel():
    conn = get_fakeredis()
    conn.flushdb()

    dataset = Dataset(conn=conn, name="abc")
    assert conn.keys() == []

    # Enqueueing tasks stores status data in Redis
    dataset.add_task("1", "ingest")
    dataset.add_task("2", "index")
    dataset.checkout_task("1", "ingest")
    assert conn.keys() != []

    # Cancelling a dataset removes associated data from Redis
    dataset.cancel()
    assert conn.keys() == []


def test_dataset_mark_done():
    conn = get_fakeredis()
    conn.flushdb()

    dataset = Dataset(conn=conn, name="abc")
    assert conn.keys() == []

    task = Task(
        task_id="1",
        job_id="abc",
        delivery_tag="",
        operation="ingest",
        context={},
        payload={},
        priority=5,
        collection_id="abc",
    )

    # Enqueueing a task stores status data in Redis
    dataset.add_task(task.task_id, task.operation)
    dataset.checkout_task(task.task_id, task.operation)
    assert conn.keys() != []

    # Marking the last task as done cleans up status data in Redis
    dataset.mark_done(task)
    assert conn.keys() == []


@pytest.fixture
def prom_registry():
    # This relies on internal implementation details of the client to reset
@@ -325,7 +516,6 @@ def test_get_priority_bucket():
}
],
"start_time": "2024-06-25T10:58:49.779811",
"end_time": None,
"last_update": "2024-06-25T10:58:49.779819",
}
},
@@ -354,7 +544,6 @@ def test_get_priority_bucket():
}
],
"start_time": "2024-06-25T10:58:49.779811",
"end_time": None,
"last_update": "2024-06-25T10:58:49.779819",
}
},