Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
improve worker error handling in MultiProcessDataLoader (#4912)
Browse files Browse the repository at this point in the history
* improve worker error handling

* rename test file
  • Loading branch information
epwalsh committed Jan 14, 2021
1 parent 94dd9cc commit d7c9eab
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 8 deletions.
22 changes: 14 additions & 8 deletions allennlp/data/data_loaders/multiprocess_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from multiprocessing.process import BaseProcess
from os import PathLike
import random
import sys
import traceback
from typing import List, Iterator, Optional, Iterable, Union

Expand Down Expand Up @@ -399,8 +398,7 @@ def _iter_batches(self) -> Iterator[TensorDict]:
for batch, worker_error in iter(queue.get, (None, None)):
if worker_error is not None:
e, tb = worker_error
sys.stderr.write("".join(tb))
raise WorkerError(e)
raise WorkerError(e, tb)

if not self._worker_cuda_safe and self.cuda_device is not None:
# Need to move batch to target device now.
Expand Down Expand Up @@ -473,7 +471,7 @@ def _instance_worker(self, worker_id: int, queue: mp.JoinableQueue) -> None:
checked_for_token_indexers = True
queue.put((instance, None))
except Exception as e:
queue.put((None, (str(e), traceback.format_exc())))
queue.put((None, (repr(e), traceback.format_exc())))

# Indicate to the consumer that this worker is finished.
queue.put((None, None))
Expand All @@ -490,7 +488,7 @@ def _batch_worker(self, worker_id: int, queue: mp.JoinableQueue) -> None:
):
queue.put((batch, None))
except Exception as e:
queue.put((None, (str(e), traceback.format_exc())))
queue.put((None, (repr(e), traceback.format_exc())))

# Indicate to the consumer (main thread) that this worker is finished.
queue.put((None, None))
Expand All @@ -504,8 +502,7 @@ def _gather_instances(self, queue: mp.JoinableQueue) -> Iterable[Instance]:
for instance, worker_error in iter(queue.get, (None, None)):
if worker_error is not None:
e, tb = worker_error
sys.stderr.write("".join(tb))
raise WorkerError(e)
raise WorkerError(e, tb)

self.reader.apply_token_indexers(instance)
if self._vocab is not None:
Expand Down Expand Up @@ -574,4 +571,13 @@ class WorkerError(Exception):
An error raised when a worker fails.
"""

pass
def __init__(self, original_err_repr: str, traceback: List[str]) -> None:
    """
    Build the exception message from a worker's error and its traceback.

    # Parameters

    original_err_repr : `str`
        The `repr()` of the exception that was raised inside the worker.
    traceback : `List[str]`
        The worker's formatted traceback. The pieces are concatenated with
        `"".join(...)`, so a single pre-formatted string yields the same text.
    """
    worker_trace = "".join(traceback)
    # The "Traceback (most recent call last)" header is redundant under our own
    # "Traceback from worker" heading, so drop it.
    worker_trace = worker_trace.replace("Traceback (most recent call last):\n", "")
    # Indent every continuation line so the worker's traceback is visibly separate
    # from the traceback of the main process.
    worker_trace = worker_trace.replace("\n", "\n ")

    heading = f"worker raised {original_err_repr}\n\n" + " Traceback from worker:\n "
    super().__init__(heading + worker_trace)

0 comments on commit d7c9eab

Please sign in to comment.