Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add options for transport tasks #4583

Merged
merged 3 commits into from Nov 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
25 changes: 13 additions & 12 deletions aiida/engine/processes/calcjobs/tasks.py
Expand Up @@ -22,6 +22,7 @@
from aiida.engine.daemon import execmanager
from aiida.engine.utils import exponential_backoff_retry, interruptable_task
from aiida.schedulers.datastructures import JobState
from aiida.manage.configuration import get_config

from ..process import ProcessState

Expand All @@ -31,8 +32,8 @@
RETRIEVE_COMMAND = 'retrieve'
KILL_COMMAND = 'kill'

TRANSPORT_TASK_RETRY_INITIAL_INTERVAL = 20
TRANSPORT_TASK_MAXIMUM_ATTEMTPS = 5
RETRY_INTERVAL_OPTION = 'transport.task_retry_initial_interval'
MAX_ATTEMPTS_OPTION = 'transport.task_maximum_attempts'

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -63,8 +64,8 @@ def task_upload_job(process, transport_queue, cancellable):
logger.warning(f'CalcJob<{node.pk}> already marked as SUBMITTING, skipping task_update_job')
raise Return

initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS
initial_interval = get_config().get_option(RETRY_INTERVAL_OPTION)
max_attempts = get_config().get_option(MAX_ATTEMPTS_OPTION)

authinfo = node.computer.get_authinfo(node.user)

Expand Down Expand Up @@ -124,8 +125,8 @@ def task_submit_job(node, transport_queue, cancellable):
logger.warning(f'CalcJob<{node.pk}> already marked as WITHSCHEDULER, skipping task_submit_job')
raise Return(node.get_job_id())

initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS
initial_interval = get_config().get_option(RETRY_INTERVAL_OPTION)
max_attempts = get_config().get_option(MAX_ATTEMPTS_OPTION)

authinfo = node.computer.get_authinfo(node.user)

Expand Down Expand Up @@ -172,8 +173,8 @@ def task_update_job(node, job_manager, cancellable):
logger.warning(f'CalcJob<{node.pk}> already marked as RETRIEVING, skipping task_update_job')
raise Return(True)

initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS
initial_interval = get_config().get_option(RETRY_INTERVAL_OPTION)
max_attempts = get_config().get_option(MAX_ATTEMPTS_OPTION)

authinfo = node.computer.get_authinfo(node.user)
job_id = node.get_job_id()
Expand Down Expand Up @@ -233,8 +234,8 @@ def task_retrieve_job(node, transport_queue, retrieved_temporary_folder, cancell
logger.warning(f'CalcJob<{node.pk}> already marked as PARSING, skipping task_retrieve_job')
raise Return

initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS
initial_interval = get_config().get_option(RETRY_INTERVAL_OPTION)
max_attempts = get_config().get_option(MAX_ATTEMPTS_OPTION)

authinfo = node.computer.get_authinfo(node.user)

Expand Down Expand Up @@ -291,8 +292,8 @@ def task_kill_job(node, transport_queue, cancellable):
:raises: Return if the tasks was successfully completed
:raises: TransportTaskException if after the maximum number of retries the transport task still excepted
"""
initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS
initial_interval = get_config().get_option(RETRY_INTERVAL_OPTION)
max_attempts = get_config().get_option(MAX_ATTEMPTS_OPTION)

if node.get_state() in [CalcJobState.UPLOADING, CalcJobState.SUBMITTING]:
logger.warning(f'CalcJob<{node.pk}> killed, it was in the {node.get_state()} state')
Expand Down
3 changes: 2 additions & 1 deletion aiida/engine/utils.py
Expand Up @@ -154,7 +154,8 @@ def exponential_backoff_retry(fct, initial_interval=10.0, max_attempts=5, logger
This coroutine will loop ``max_attempts`` times, calling the ``fct`` function, breaking immediately when the call
finished without raising an exception, at which point the returned result will be raised, wrapped in a
``tornado.gen.Result`` instance. If an exception is caught, the function will yield a ``tornado.gen.sleep`` with a
time interval equal to the ``initial_interval`` multiplied by ``2*N`` where ``N`` is the number of excepted calls.
time interval equal to the ``initial_interval`` multiplied by ``2 ** (N - 1)`` where ``N`` is the number of
excepted calls.

:param fct: the function to call, which will be turned into a coroutine first if it is not already
:param initial_interval: the time to wait after the first caught exception before calling the coroutine again
Expand Down
16 changes: 16 additions & 0 deletions aiida/manage/configuration/options.py
Expand Up @@ -185,6 +185,22 @@
'description': 'Boolean whether to print AiiDA deprecation warnings',
'global_only': False,
},
'transport.task_retry_initial_interval': {
'key': 'task_retry_initial_interval',
'valid_type': 'int',
'valid_values': None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would be good here if we could validate for >0

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could imagine merging valid_type/valid_values into something akin to a jsonschema or just allowing it to be callable.

e.g.

{
 "a": {
  "validator": {"type": "integer", "minimum": 1}
 },
 "b": {
  "validator": {"type": "string", "enum": ["value1", "value2"]}
 }
}

or

{
 "a": {
  "validator": lambda v: isinstance(v, int) and v > 0
 },
 "b": {
  "validator": lambda v: isinstance(v, str) and v in ("value1", "value2")
 }
}

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extending the validation is definitely a good idea, but perhaps we should open a new issue/PR for this? Note that the valid_type is also used to parse the option value, so I'm not sure if we can just merge valid_type/valid_values into one validator function.

'default': 20,
'description': 'Initial time interval for the exponential backoff mechanism.',
'global_only': False,
},
'transport.task_maximum_attempts': {
'key': 'task_maximum_attempts',
'valid_type': 'int',
'valid_values': None,
'default': 5,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something I also noticed is that some defaults are just set in the option dict, and some are set by global variables at the top of the module. We should probably be consistent in this. I'd be in favour of removing the global variables, I think the defaults are easy enough to find.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeh makes sense if they are not used anywhere else

'description': 'Maximum number of transport task attempts before a Process is Paused.',
'global_only': False,
},
}


Expand Down