Increase minimum job poll interval to reduce verdi daemon CPU usage (#…

…466) Set the job poll interval to 2 seconds. This should keep the CPU utilization below 10% according to @cpignedoli's benchmarks in #462. At the same time, 5s felt too long, since users might want to run super quick calculations to try things out. AiiDA does not yet provide an API to set the interval, so Python code is injected directly into the startup script to set it.
aiidalab · Jun 11, 2024 · 917ca0e · 917ca0e
1 parent 503d11b
commit 917ca0e
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 5 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -21,6 +21,9 @@ on:
                 required: false
                 type: boolean
 
+env:
+    FORCE_COLOR: 1
+
 jobs:
 
     test:
@@ -53,5 +56,5 @@ jobs:
                   pip freeze
 
             - name: Run tests
-              run: pytest -m "${{ inputs.integration && 'integration' || 'not integration' }}" --target ${{inputs.target}}
+              run: pytest -s -m "${{ inputs.integration && 'integration' || 'not integration' }}" --target ${{inputs.target}}
               env: ${{ fromJSON(inputs.images) }}
diff --git a/stack/base/before-notebook.d/40_prepare-aiida.sh b/stack/base/before-notebook.d/40_prepare-aiida.sh
@@ -55,10 +55,22 @@ if [[ ${NEED_SETUP_PROFILE} == true ]]; then
         --work-dir /home/${NB_USER}/aiida_run/                          \
         --mpirun-command "mpirun -np {tot_num_mpiprocs}"                \
         --mpiprocs-per-machine ${LOCALHOST_MPI_PROCS_PER_MACHINE} &&    \
-    verdi computer configure core.local "${computer_name}" \
+    verdi computer configure core.local "${computer_name}"              \
         --non-interactive                                               \
         --safe-interval 0.0
 
+    # We need to limit how often the daemon worker polls the job scheduler
+    # for job status. The poll interval is set to 0s by default, which results
+    # in verdi worker spinning at 100% CPU.
+    # We set this to 2.0 seconds which should limit the CPU utilization below 10%.
+    # https://aiida.readthedocs.io/projects/aiida-core/en/stable/howto/run_codes.html#mitigating-connection-overloads
+    job_poll_interval="2.0"
+    python -c "
+from aiida import load_profile; from aiida.orm import load_computer;
+load_profile();
+load_computer('${computer_name}').set_minimum_job_poll_interval(${job_poll_interval})
+"
+
 else
 
   # Migration will run for the default profile.

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -62,11 +62,19 @@ def docker_compose_file(pytestconfig):
 @pytest.fixture(scope="session")
 def notebook_service(docker_ip, docker_services):
     """Ensure that HTTP service is up and responsive."""
+
+    # using `docker_compose` fixture would trigger a separate container
+    docker_compose = docker_services._docker_compose
     port = docker_services.port_for("aiidalab", 8888)
     url = f"http://{docker_ip}:{port}"
-    docker_services.wait_until_responsive(
-        timeout=60.0, pause=0.1, check=lambda: is_responsive(url)
-    )
+    try:
+        docker_services.wait_until_responsive(
+            timeout=60.0, pause=0.1, check=lambda: is_responsive(url)
+        )
+    except Exception as e:
+        print(docker_compose.execute("logs").decode().strip())
+        # Let's exit hard, otherwise pytest output is a huge mess.
+        pytest.exit(e)
     return url