Merge branch 'master' into vision

allenai · Jul 20, 2020 · 6cc508d · 6cc508d
2 parents f87df83 + 478bf46
commit 6cc508d
Show file tree

Hide file tree

Showing 8 changed files with 86 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Fixed
+
+- Removed unnecessary warning about deadlocks in `DataLoader`.
+- Use slower tqdm intervals when output is being piped or redirected.
+
+
 ## [v1.1.0rc1](https://github.com/allenai/allennlp/releases/tag/v1.1.0rc1) - 2020-07-14
 
 ### Fixed
@@ -31,7 +37,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   in case it does not have a tokenizer.
 - `reg_loss` is only now returned for models that have some regularization penalty configured.
 - Fixed a bug that prevented `cached_path` from downloading assets from GitHub releases.
-- Fixed a bug that erronously increased last label's false positive count in calculating fbeta metrics.
+- Fixed a bug that erroneously increased last label's false positive count in calculating fbeta metrics.
 - `Tqdm` output now looks much better when the output is being piped or redirected.
 - Small improvements to how the API documentation is rendered.
 - Only show validation progress bar from main process in distributed training.
@@ -50,7 +56,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   scalar mix of all hidden layers from the transformer model instead of just the last layer. To utilize
   this, just set `last_layer_only` to `False`.
 - `cached_path()` can now read files inside of archives.
-- Training metrics now include per-batch loss in addition to aggregate loss across number of batches.
+- Training metrics now include the per-batch `batch_loss` and `batch_reg_loss` in addition to the aggregate loss averaged over all batches.
 
 ### Changed
 

diff --git a/allennlp/common/tqdm.py b/allennlp/common/tqdm.py
@@ -73,6 +73,8 @@ def flush(self):
 class Tqdm:
     @staticmethod
     def tqdm(*args, **kwargs):
-        new_kwargs = {"file": TqdmToLogsWriter(), **kwargs}
+        # Use a slow interval if the output is being piped or redirected.
+        default_mininterval = 0.1 if sys.stderr.isatty() else 10.0
+        new_kwargs = {"file": TqdmToLogsWriter(), "mininterval": default_mininterval, **kwargs}
 
         return _tqdm(*args, **new_kwargs)
diff --git a/allennlp/data/dataloader.py b/allennlp/data/dataloader.py
@@ -1,13 +1,11 @@
 from typing import List, Dict, Union, Iterator
-import warnings
 
 import torch
 from torch.utils import data
 
 from allennlp.common.registrable import Registrable
 from allennlp.common.lazy import Lazy
 from allennlp.data.instance import Instance
-from allennlp.data.dataset_readers.dataset_reader import AllennlpLazyDataset
 from allennlp.data.batch import Batch
 from allennlp.data.samplers import Sampler, BatchSampler
 
@@ -87,13 +85,6 @@ def __init__(
         multiprocessing_context: str = None,
         batches_per_epoch: int = None,
     ):
-        if num_workers and isinstance(dataset, AllennlpLazyDataset):
-            warnings.warn(
-                "Using multi-process data loading with a lazy dataset could lead to "
-                "deadlocks with certain tokenizers. See:\n"
-                "  https://github.com/allenai/allennlp/issues/4330\n",
-                UserWarning,
-            )
         super().__init__(
             dataset=dataset,
             batch_size=batch_size,

diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py
@@ -507,11 +507,14 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:
         regularization_penalty = self.model.get_regularization_penalty()
 
         train_loss = 0.0
+        batch_loss = 0.0
 
         if regularization_penalty is not None:
             train_reg_loss = 0.0
+            batch_reg_loss = 0.0
         else:
             train_reg_loss = None
+            batch_reg_loss = None
         # Set the model to "train" mode.
         self._pytorch_model.train()
 
@@ -588,10 +591,12 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:
                         scaled_loss.backward()
                 else:
                     loss.backward()
-                train_loss += loss.item()
+                batch_loss = loss.item()
+                train_loss += batch_loss
                 if reg_loss is not None:
                     reg_loss = reg_loss / len(batch_group)
-                    train_reg_loss += reg_loss.item()
+                    batch_reg_loss = reg_loss.item()
+                    train_reg_loss += batch_reg_loss
 
             batch_grad_norm = self.rescale_gradients()
 
@@ -627,6 +632,8 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:
                 self.model,
                 train_loss,
                 train_reg_loss,
+                batch_loss,
+                batch_reg_loss,
                 batches_this_epoch,
                 world_size=self._world_size,
                 cuda_device=self.cuda_device,
@@ -675,7 +682,9 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:
             self.model,
             train_loss,
             train_reg_loss,
-            batches_this_epoch,
+            batch_loss=None,
+            batch_reg_loss=None,
+            num_batches=batches_this_epoch,
             reset=True,
             world_size=self._world_size,
             cuda_device=self.cuda_device,
@@ -717,10 +726,13 @@ def _validation_loss(self, epoch: int) -> Tuple[float, float, int]:
 
         batches_this_epoch = 0
         val_loss = 0
+        val_batch_loss = 0
         if regularization_penalty is not None:
             val_reg_loss = 0
+            val_batch_reg_loss = 0
         else:
             val_reg_loss = None
+            val_batch_reg_loss = None
         done_early = False
         for batch in val_generator_tqdm:
             if self._distributed:
@@ -752,15 +764,19 @@ def _validation_loss(self, epoch: int) -> Tuple[float, float, int]:
                 # count those batches for which we actually have a loss.  If this variable ever
                 # gets used for something else, we might need to change things around a bit.
                 batches_this_epoch += 1
-                val_loss += loss.detach().cpu().numpy()
+                val_batch_loss = loss.detach().cpu().numpy()
+                val_loss += val_batch_loss
                 if reg_loss is not None:
-                    val_reg_loss += reg_loss.detach().cpu().numpy()
+                    val_batch_reg_loss = reg_loss.detach().cpu().numpy()
+                    val_reg_loss += val_batch_reg_loss
 
             # Update the description with the latest metrics
             val_metrics = training_util.get_metrics(
                 self.model,
                 val_loss,
                 val_reg_loss,
+                val_batch_loss,
+                val_batch_reg_loss,
                 batches_this_epoch,
                 world_size=self._world_size,
                 cuda_device=self.cuda_device,
@@ -852,7 +868,9 @@ def train(self) -> Dict[str, Any]:
                         self.model,
                         val_loss,
                         val_reg_loss,
-                        num_batches,
+                        batch_loss=None,
+                        batch_reg_loss=None,
+                        num_batches=num_batches,
                         reset=True,
                         world_size=self._world_size,
                         cuda_device=self.cuda_device,

diff --git a/allennlp/training/util.py b/allennlp/training/util.py
@@ -273,6 +273,8 @@ def get_metrics(
     model: Model,
     total_loss: float,
     total_reg_loss: Optional[float],
+    batch_loss: Optional[float],
+    batch_reg_loss: Optional[float],
     num_batches: int,
     reset: bool = False,
     world_size: int = 1,
@@ -285,12 +287,12 @@ def get_metrics(
     Returns the `"batch_loss"` separately.
     """
     metrics = model.get_metrics(reset=reset)
-    if not reset:
-        metrics["batch_loss"] = total_loss
+    if batch_loss is not None:
+        metrics["batch_loss"] = batch_loss
     metrics["loss"] = float(total_loss / num_batches) if num_batches > 0 else 0.0
     if total_reg_loss is not None:
-        if not reset:
-            metrics["batch_reg_loss"] = total_reg_loss
+        if batch_reg_loss is not None:
+            metrics["batch_reg_loss"] = batch_reg_loss
         metrics["reg_loss"] = float(total_reg_loss / num_batches) if num_batches > 0 else 0.0
 
     if world_size > 1:

diff --git a/tests/data/dataloader_test.py b/tests/data/dataloader_test.py
@@ -5,20 +5,7 @@
 from allennlp.data.fields import LabelField
 from allennlp.data.instance import Instance
 from allennlp.data.dataloader import PyTorchDataLoader
-from allennlp.data.dataset_readers.dataset_reader import (
-    DatasetReader,
-    AllennlpLazyDataset,
-)
-
-
-def test_multi_processing_with_lazy_dataset_warns():
-    def fake_instance_generator(file_name: str) -> Iterable[Instance]:
-        yield from []
-
-    with pytest.warns(UserWarning, match=r".*deadlocks.*"):
-        PyTorchDataLoader(
-            AllennlpLazyDataset(fake_instance_generator, "nonexistent_file"), num_workers=1
-        )
+from allennlp.data.dataset_readers.dataset_reader import DatasetReader
 
 
 @pytest.mark.parametrize("lazy", (True, False))

diff --git a/tests/training/trainer_test.py b/tests/training/trainer_test.py
@@ -990,6 +990,43 @@ def __call__(
         expected_calls = [epoch for epoch in range(-1, 4)]
         assert trainer.epoch_callback_calls == expected_calls
 
+    def test_total_loss_is_average_of_batch_loss(self):
+
+        batches_per_epoch = 3
+
+        data_loader_custom_epoch_lazy = PyTorchDataLoader(
+            self.instances_lazy,
+            batch_size=2,
+            collate_fn=allennlp_collate,
+            batches_per_epoch=batches_per_epoch,
+        )
+
+        class FakeBatchCallback(BatchCallback):
+            def __call__(
+                self,
+                trainer: "GradientDescentTrainer",
+                batch_inputs: List[List[TensorDict]],
+                batch_outputs: List[Dict[str, Any]],
+                epoch: int,
+                batch_number: int,
+                is_training: bool,
+                is_master: bool,
+            ) -> None:
+                if not hasattr(trainer, "batch_losses"):
+                    trainer.batch_losses = []  # type: ignore
+                trainer.batch_losses.append(batch_outputs[0]["loss"].item())  # type: ignore
+
+        trainer = GradientDescentTrainer(
+            self.model,
+            self.optimizer,
+            data_loader_custom_epoch_lazy,
+            num_epochs=1,
+            batch_callbacks=[FakeBatchCallback()],
+        )
+        metrics = trainer.train()
+
+        assert metrics["training_loss"] == float(sum(trainer.batch_losses) / batches_per_epoch)
+
 
 class TestApexTrainer(TrainerTestBase):
     @requires_gpu

diff --git a/tests/training/util_test.py b/tests/training/util_test.py
@@ -170,14 +170,15 @@ def forward(self, **kwargs):
                 return {}
 
         model = FakeModel(None)
-        loss = 10.0
+        total_loss = 100.0
+        batch_loss = 10.0
         num_batches = 2
-        metrics = get_metrics(model, loss, None, num_batches)
+        metrics = get_metrics(model, total_loss, None, batch_loss, None, num_batches)
 
-        assert metrics["loss"] == float(loss / num_batches)
-        assert metrics["batch_loss"] == loss
+        assert metrics["loss"] == float(total_loss / num_batches)
+        assert metrics["batch_loss"] == batch_loss
 
-        metrics = get_metrics(model, loss, None, num_batches, reset=True)
+        metrics = get_metrics(model, total_loss, None, None, None, num_batches)
 
-        assert metrics["loss"] == float(loss / num_batches)
+        assert metrics["loss"] == float(total_loss / num_batches)
         assert "batch_loss" not in metrics