Allow rerunning failed runs.

* Document how to rerun failed runs.
* Skip runs that have already been started.
aibasel · Jul 9, 2021 · 61e1732 · 61e1732
1 parent bfbadbd
commit 61e1732
Show file tree

Hide file tree

Showing 4 changed files with 49 additions and 10 deletions.
diff --git a/docs/faq.rst b/docs/faq.rst
@@ -18,6 +18,21 @@ How can I combine the results from multiple experiments?
     exp.add_report(AbsoluteReport())
 
 
+Some runs failed. How can I rerun them?
+---------------------------------------
+
+If the failed runs were never started, for example, due to grid node
+failures, you can simply run the "start" experiment step again. It will
+skip all runs that have already been started. Afterwards, run "fetch" and
+make reports as usual.
+
+Lab detects which runs have already been started by checking if the
+``driver.log`` file exists. So if you have failed runs that were already
+started, but you want to rerun them anyway, go to their run directories,
+remove the ``driver.log`` files and then run the "start" experiment step
+again as above.
+
+
 I forgot to parse something. How can I run only the parsers again?
 ------------------------------------------------------------------
 

diff --git a/docs/news.rst b/docs/news.rst
@@ -1,6 +1,20 @@
 Changelog
 =========
 
+next (unreleased)
+-----------------
+
+Lab
+^^^
+* Allow rerunning experiments. This is useful if some runs were never started,
+  for example, due to grid node failures. All runs that have already been started
+  are skipped. For more information see the corresponding FAQ (Jendrik Seipp).
+
+Downward Lab
+^^^^^^^^^^^^
+* None so far.
+
+
 v6.4 (2021-07-06)
 -----------------
 

diff --git a/lab/data/local-job.py.template b/lab/data/local-job.py.template
@@ -25,20 +25,28 @@ def process_task(task_id):
     run_id = get_run_id(task_id)
     run_dir = get_run_dir(run_id)
     error = False
-    with open(os.path.join(run_dir, "driver.log"), "w") as driver_log:
+    driver_log_file = os.path.join(run_dir, "driver.log")
+
+    if os.path.exists(driver_log_file):
+        logging.info(f"The run in {run_dir} has already been started --> skip it")
+        return False
+
+    with open(driver_log_file, "w") as driver_log:
         with open(os.path.join(run_dir, "driver.err"), "w") as driver_err:
             logging.info(f"Starting run {run_id} (TASK_ID {task_id}) in {run_dir}")
             try:
                 subprocess.check_call(
                     [tools.get_python_executable(), "run"],
                     cwd=run_dir, stdout=driver_log, stderr=driver_err)
-            except subprocess.CalledProcessError as err:
+            except subprocess.CalledProcessError:
                 error = True
-    if os.path.getsize(driver_err.name) != 0:
+
+    # driver.log always has content for a successful run, so we never delete it.
+    if os.path.getsize(driver_err.name) == 0:
+        os.remove(driver_err.name)
+    else:
         error = True
-    for f in [driver_log, driver_err]:
-        if os.path.getsize(f.name) == 0:
-            os.remove(f.name)
+
     return error
 
 

diff --git a/lab/data/slurm-run-job-body.template b/lab/data/slurm-run-job-body.template
@@ -11,6 +11,11 @@ function print_run_dir {
 }
 
 function execute_run {
+    if [[ -f driver.log ]]; then
+        echo "The run in $(pwd) has already been started --> skip it"
+        return
+    fi
+
     (
     "%(python)s" run
     RETCODE=$?
@@ -19,10 +24,7 @@ function execute_run {
     fi
     ) > driver.log 2> driver.err
 
-    # Delete empty driver files.
-    if [[ ! -s driver.log ]]; then
-        rm driver.log
-    fi
+    # Delete empty driver.err files. driver.log always has content (for started runs).
     if [[ ! -s driver.err ]]; then
         rm driver.err
     fi