From c2c69dd84a8d0476e0601545e5795db459a36bba Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Thu, 12 Dec 2019 16:18:56 -0800 Subject: [PATCH 01/15] strip out old DP stuff, ensure multiple cuda devices raises errors --- allennlp/commands/train.py | 21 ++++--- .../tests/training/callback_trainer_test.py | 57 ++++++------------- allennlp/tests/training/trainer_test.py | 56 ++++++------------ allennlp/training/callback_trainer.py | 17 +++--- allennlp/training/trainer.py | 19 +++---- allennlp/training/trainer_base.py | 24 ++++---- allennlp/training/util.py | 30 ---------- 7 files changed, 75 insertions(+), 149 deletions(-) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index e846012e9da..b568c7a22b8 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -270,10 +270,15 @@ def train_model( params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) cuda_device = params.params.get("trainer").get("cuda_device", -1) + device_id = parse_cuda_device(cuda_device) check_for_gpu(cuda_device) + multi_device = isinstance(device_id, list) distributed = params.params.get("trainer").get("distributed", False) - if not distributed: + + # If distributed isn't in the config and the config contains strictly + # one cuda device, we just run a single training process. + if not distributed or not multi_device: model = _train_worker( process_rank=0, params=params, @@ -286,14 +291,16 @@ def train_model( ) archive_model(serialization_dir, files_to_archive=params.files_to_archive) return model - else: - device_id = parse_cuda_device(cuda_device) - if not isinstance(device_id, list): - raise ConfigurationError( - "Multiple cuda devices need to be configured to run distributed training." - ) + # If the config contains the distributed flag, but only one GPU, we raise an error, + # because this combination is probably a mistake. + elif distributed and not multi_device: + raise ConfigurationError( + "Multiple cuda devices need to be configured to run distributed training." + ) + # Otherwise, we are running multiple processes for training. + else: master_addr = params.params.get("trainer").pop("master_address", "127.0.0.1") master_port = params.params.get("trainer").pop("master_port", 29500) num_procs = len(device_id) diff --git a/allennlp/tests/training/callback_trainer_test.py b/allennlp/tests/training/callback_trainer_test.py index 5578d2ffc5a..132f0aa5b85 100644 --- a/allennlp/tests/training/callback_trainer_test.py +++ b/allennlp/tests/training/callback_trainer_test.py @@ -262,52 +262,29 @@ def test_trainer_can_run_cuda(self): callbacks=self.default_callbacks(), cuda_device=0, ) - trainer.train() - - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need multiple GPUs.") - def test_trainer_can_run_multiple_gpu(self): - self.model.cuda() - - class MetaDataCheckWrapper(Model): - """ - Checks that the metadata field has been correctly split across the batch dimension - when running on multiple gpus. - """ - - def __init__(self, model): - super().__init__(model.vocab) - self.model = model - - def forward(self, **kwargs) -> Dict[str, torch.Tensor]: # type: ignore - assert ( - "metadata" in kwargs and "tags" in kwargs - ), f"tokens and metadata must be provided. Got {kwargs.keys()} instead." - batch_size = kwargs["tokens"]["tokens"].size()[0] - assert len(kwargs["metadata"]) == batch_size, ( - f"metadata must be split appropriately. Expected {batch_size} elements, " - f"got {len(kwargs['metadata'])} elements." 
- ) - return self.model.forward(**kwargs) - - multigpu_iterator = BasicIterator(batch_size=4) - multigpu_iterator.index_with(self.vocab) - trainer = CallbackTrainer( - MetaDataCheckWrapper(self.model), - training_data=self.instances, - iterator=multigpu_iterator, - optimizer=self.optimizer, - num_epochs=2, - callbacks=self.default_callbacks(), - cuda_device=[0, 1], - ) metrics = trainer.train() assert "peak_cpu_memory_MB" in metrics assert isinstance(metrics["peak_cpu_memory_MB"], float) assert metrics["peak_cpu_memory_MB"] > 0 assert "peak_gpu_0_memory_MB" in metrics assert isinstance(metrics["peak_gpu_0_memory_MB"], int) - assert "peak_gpu_1_memory_MB" in metrics - assert isinstance(metrics["peak_gpu_1_memory_MB"], int) + + + def test_passing_trainer_multiple_gpus_raises_error(self): + self.model.cuda() + + multigpu_iterator = BasicIterator(batch_size=4) + multigpu_iterator.index_with(self.vocab) + with pytest.raises(ConfigurationError): + trainer = CallbackTrainer( + self.model, + training_data=self.instances, + iterator=multigpu_iterator, + optimizer=self.optimizer, + num_epochs=2, + callbacks=self.default_callbacks(), + cuda_device=[0, 1], + ) def test_trainer_can_resume_training(self): trainer = CallbackTrainer( diff --git a/allennlp/tests/training/trainer_test.py b/allennlp/tests/training/trainer_test.py index c1774d9a770..3ecc36481c3 100644 --- a/allennlp/tests/training/trainer_test.py +++ b/allennlp/tests/training/trainer_test.py @@ -107,51 +107,29 @@ def test_trainer_can_run_cuda(self): trainer = Trainer( self.model, self.optimizer, self.iterator, self.instances, num_epochs=2, cuda_device=0 ) - trainer.train() - - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need multiple GPUs.") - def test_trainer_can_run_multiple_gpu(self): - self.model.cuda() - - class MetaDataCheckWrapper(Model): - """ - Checks that the metadata field has been correctly split across the batch dimension - when running on multiple gpus. - """ - - def __init__(self, model): - super().__init__(model.vocab) - self.model = model - - def forward(self, **kwargs) -> Dict[str, torch.Tensor]: # type: ignore - assert ( - "metadata" in kwargs and "tags" in kwargs - ), f"tokens and metadata must be provided. Got {kwargs.keys()} instead." - batch_size = kwargs["tokens"]["tokens"].size()[0] - assert len(kwargs["metadata"]) == batch_size, ( - f"metadata must be split appropriately. Expected {batch_size} elements, " - f"got {len(kwargs['metadata'])} elements." 
- ) - return self.model.forward(**kwargs) - - multigpu_iterator = BasicIterator(batch_size=4) - multigpu_iterator.index_with(self.vocab) - trainer = Trainer( - MetaDataCheckWrapper(self.model), - self.optimizer, - multigpu_iterator, - self.instances, - num_epochs=2, - cuda_device=[0, 1], - ) metrics = trainer.train() assert "peak_cpu_memory_MB" in metrics assert isinstance(metrics["peak_cpu_memory_MB"], float) assert metrics["peak_cpu_memory_MB"] > 0 assert "peak_gpu_0_memory_MB" in metrics assert isinstance(metrics["peak_gpu_0_memory_MB"], int) - assert "peak_gpu_1_memory_MB" in metrics - assert isinstance(metrics["peak_gpu_1_memory_MB"], int) + + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need multiple GPUs.") + def test_passing_trainer_multiple_gpus_raises_error(self): + self.model.cuda() + + multigpu_iterator = BasicIterator(batch_size=4) + multigpu_iterator.index_with(self.vocab) + with pytest.raises(ConfigurationError): + trainer = Trainer( + self.model, + self.optimizer, + multigpu_iterator, + self.instances, + num_epochs=2, + cuda_device=[0, 1], + ) def test_trainer_can_resume_training(self): trainer = Trainer( diff --git a/allennlp/training/callback_trainer.py b/allennlp/training/callback_trainer.py index 2226a6f90bf..c7d11cd6332 100644 --- a/allennlp/training/callback_trainer.py +++ b/allennlp/training/callback_trainer.py @@ -55,7 +55,7 @@ def __init__( num_epochs: int = 20, shuffle: bool = True, serialization_dir: Optional[str] = None, - cuda_device: Union[int, List] = -1, + cuda_device: int = -1, callbacks: List[Callback] = None, distributed: bool = False, rank: int = 0, @@ -96,8 +96,10 @@ def __init__( serialization_dir : str, optional (default=None) Path to directory for saving and loading model files. Models will not be saved if this parameter is not passed. - cuda_device : ``Union[int, List[int]]``, optional (default=-1) + cuda_device : ``int``, optional (default=-1) An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used. + Data parallelism is controlled at the allennlp train level, so each trainer will have a single + GPU. callbacks : ``List[Callback]``, optional (default=None) A list of callbacks that will be called based on training events. """ @@ -181,13 +183,10 @@ def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch This is a method on the trainer so that it can be used both in training and validation (which are handled separately). 
""" - if self._multiple_gpu: - output_dict = training_util.data_parallel(batch_group, self.model, self._cuda_devices) - else: - assert len(batch_group) == 1 - batch = batch_group[0] - batch = nn_util.move_to_device(batch, self._cuda_devices[0]) - output_dict = self._pytorch_model(**batch) + assert len(batch_group) == 1 + batch = batch_group[0] + batch = nn_util.move_to_device(batch, self._cuda_devices[0]) + output_dict = self._pytorch_model(**batch) try: loss = output_dict["loss"] diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 900e66b6aa1..2cfdc4fb70c 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -51,7 +51,7 @@ def __init__( keep_serialized_model_every_num_seconds: int = None, checkpointer: Checkpointer = None, model_save_interval: float = None, - cuda_device: Union[int, List] = -1, + cuda_device: int = -1, grad_norm: Optional[float] = None, grad_clipping: Optional[float] = None, learning_rate_scheduler: Optional[LearningRateScheduler] = None, @@ -128,8 +128,10 @@ def __init__( If provided, then serialize models every ``model_save_interval`` seconds within single epochs. In all cases, models are also saved at the end of every epoch if ``serialization_dir`` is provided. - cuda_device : ``Union[int, List[int]]``, optional (default = -1) - An integer or list of integers specifying the CUDA device(s) to use. If -1, the CPU is used. + cuda_device : ``int``, optional (default = -1) + An integer specifying the CUDA device(s) to use for this process. If -1, the CPU is used. + Data parallelism is controlled at the allennlp train level, so each trainer will have a single + GPU. grad_norm : ``float``, optional, (default = None). If provided, gradient norms will be rescaled to have a maximum of this value. grad_clipping : ``float``, optional (default = ``None``). @@ -287,13 +289,10 @@ def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch Does a forward pass on the given batches and returns the ``loss`` value in the result. If ``for_training`` is `True` also applies regularization penalty. """ - if self._multiple_gpu: - output_dict = training_util.data_parallel(batch_group, self.model, self._cuda_devices) - else: - assert len(batch_group) == 1 - batch = batch_group[0] - batch = nn_util.move_to_device(batch, self._cuda_devices[0]) - output_dict = self._pytorch_model(**batch) + assert len(batch_group) == 1 + batch = batch_group[0] + batch = nn_util.move_to_device(batch, self._cuda_devices[0]) + output_dict = self._pytorch_model(**batch) try: loss = output_dict["loss"] diff --git a/allennlp/training/trainer_base.py b/allennlp/training/trainer_base.py index 6a3c9b2c467..32405b2fda6 100644 --- a/allennlp/training/trainer_base.py +++ b/allennlp/training/trainer_base.py @@ -40,8 +40,14 @@ def __init__( self._serialization_dir = serialization_dir - # Configure GPUs: - if not isinstance(cuda_device, int) and not isinstance(cuda_device, list): + if isinstance(cuda_device, list): + raise ConfigurationError( + "In allennlp 1.0, the Trainer can only be assigned a single `cuda_device`. " + "Instead, we use torch's DistributedDataParallel at the command level, meaning " + "our Trainer always uses a single GPU per process." + ) + + if not isinstance(cuda_device, int): raise ConfigurationError( "Expected an int or list for cuda_device, got {}".format(cuda_device) ) @@ -52,18 +58,8 @@ def __init__( "`cuda_device` key in the experiment configuration." 
) - if isinstance(cuda_device, list): - # For distributed training, every trainer worker is only assigned with a single GPU - if distributed: - raise ConfigurationError( - "Distributed worker can only be assigned a single `cuda_device`." - ) - - self._multiple_gpu = True - self._cuda_devices = cuda_device - else: - self._multiple_gpu = False - self._cuda_devices = [cuda_device] + self._multiple_gpu = False + self._cuda_devices = [cuda_device] self._distributed = distributed self._rank = rank diff --git a/allennlp/training/util.py b/allennlp/training/util.py index 4ebc52a23fa..a7bf3c3f78a 100644 --- a/allennlp/training/util.py +++ b/allennlp/training/util.py @@ -328,36 +328,6 @@ def create_serialization_dir( os.makedirs(serialization_dir, exist_ok=True) -def data_parallel( - batch_group: List[TensorDict], model: Model, cuda_devices: List -) -> Dict[str, torch.Tensor]: - """ - Performs a forward pass using multiple GPUs. This is a simplification - of torch.nn.parallel.data_parallel to support the allennlp model - interface. - """ - assert len(batch_group) <= len(cuda_devices) - - moved = [ - nn_util.move_to_device(batch, device) for batch, device in zip(batch_group, cuda_devices) - ] - - used_device_ids = cuda_devices[: len(moved)] - # Counterintuitively, it appears replicate expects the source device id to be the first element - # in the device id list. See torch.cuda.comm.broadcast_coalesced, which is called indirectly. - replicas = replicate(model, used_device_ids) - - # We pass all our arguments as kwargs. Create a list of empty tuples of the - # correct shape to serve as (non-existent) positional arguments. - inputs = [()] * len(batch_group) - outputs = parallel_apply(replicas, inputs, moved, used_device_ids) - - # Only the 'loss' is needed. - # a (num_gpu, ) tensor with loss on each GPU - losses = gather([output["loss"].unsqueeze(0) for output in outputs], used_device_ids[0], 0) - return {"loss": losses.mean()} - - def enable_gradient_clipping(model: Model, grad_clipping: Optional[float]) -> None: if grad_clipping is not None: for parameter in model.parameters(): From 4afc9c54e49f1b0270e6f9d3e113509dd9a865a1 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Thu, 12 Dec 2019 16:40:35 -0800 Subject: [PATCH 02/15] lint --- allennlp/tests/training/callback_trainer_test.py | 3 +-- allennlp/tests/training/trainer_test.py | 4 +--- allennlp/training/callback_trainer.py | 2 +- allennlp/training/util.py | 3 --- 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/allennlp/tests/training/callback_trainer_test.py b/allennlp/tests/training/callback_trainer_test.py index 132f0aa5b85..2d9229d806f 100644 --- a/allennlp/tests/training/callback_trainer_test.py +++ b/allennlp/tests/training/callback_trainer_test.py @@ -269,14 +269,13 @@ def test_trainer_can_run_cuda(self): assert "peak_gpu_0_memory_MB" in metrics assert isinstance(metrics["peak_gpu_0_memory_MB"], int) - def test_passing_trainer_multiple_gpus_raises_error(self): self.model.cuda() multigpu_iterator = BasicIterator(batch_size=4) multigpu_iterator.index_with(self.vocab) with pytest.raises(ConfigurationError): - trainer = CallbackTrainer( + CallbackTrainer( self.model, training_data=self.instances, iterator=multigpu_iterator, diff --git a/allennlp/tests/training/trainer_test.py b/allennlp/tests/training/trainer_test.py index 3ecc36481c3..1a182e4630f 100644 --- a/allennlp/tests/training/trainer_test.py +++ b/allennlp/tests/training/trainer_test.py @@ -114,15 +114,13 @@ def test_trainer_can_run_cuda(self): assert "peak_gpu_0_memory_MB" 
in metrics assert isinstance(metrics["peak_gpu_0_memory_MB"], int) - - @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need multiple GPUs.") def test_passing_trainer_multiple_gpus_raises_error(self): self.model.cuda() multigpu_iterator = BasicIterator(batch_size=4) multigpu_iterator.index_with(self.vocab) with pytest.raises(ConfigurationError): - trainer = Trainer( + Trainer( self.model, self.optimizer, multigpu_iterator, diff --git a/allennlp/training/callback_trainer.py b/allennlp/training/callback_trainer.py index c7d11cd6332..be0528a760e 100644 --- a/allennlp/training/callback_trainer.py +++ b/allennlp/training/callback_trainer.py @@ -7,7 +7,7 @@ import datetime import functools import math -from typing import Dict, Optional, List, Union, Any, Iterable +from typing import Dict, Optional, List, Any, Iterable import torch from allennlp.common import Params diff --git a/allennlp/training/util.py b/allennlp/training/util.py index a7bf3c3f78a..65283b87ae3 100644 --- a/allennlp/training/util.py +++ b/allennlp/training/util.py @@ -11,8 +11,6 @@ import shutil import torch -from torch.nn.parallel import replicate, parallel_apply -from torch.nn.parallel.scatter_gather import gather from allennlp.common.checks import ConfigurationError, check_for_gpu from allennlp.common.params import Params @@ -20,7 +18,6 @@ from allennlp.data.dataset_readers import DatasetReader from allennlp.data import Instance from allennlp.data.iterators import DataIterator -from allennlp.data.iterators.data_iterator import TensorDict from allennlp.models.model import Model from allennlp.models.archival import CONFIG_NAME from allennlp.nn import util as nn_util From 80803016ef38d1dc152dba1b51e05946d835073d Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 09:39:14 -0800 Subject: [PATCH 03/15] remove unused attribute --- allennlp/training/trainer_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/allennlp/training/trainer_base.py b/allennlp/training/trainer_base.py index 32405b2fda6..72077e615d4 100644 --- a/allennlp/training/trainer_base.py +++ b/allennlp/training/trainer_base.py @@ -58,7 +58,6 @@ def __init__( "`cuda_device` key in the experiment configuration." 
) - self._multiple_gpu = False self._cuda_devices = [cuda_device] self._distributed = distributed From 278c89d0cea193bc5de368deb4f7353003df83e3 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 10:15:43 -0800 Subject: [PATCH 04/15] remove _cuda_devices everywhere --- allennlp/commands/find_learning_rate.py | 16 +++++++++----- allennlp/commands/fine_tune.py | 2 +- allennlp/commands/train.py | 2 +- allennlp/training/callback_trainer.py | 12 ++++------- allennlp/training/callbacks/validate.py | 9 ++------ allennlp/training/trainer.py | 28 +++++++++---------------- allennlp/training/trainer_base.py | 6 +++--- 7 files changed, 32 insertions(+), 43 deletions(-) diff --git a/allennlp/commands/find_learning_rate.py b/allennlp/commands/find_learning_rate.py index c778ac733a1..b0347b2d5aa 100644 --- a/allennlp/commands/find_learning_rate.py +++ b/allennlp/commands/find_learning_rate.py @@ -52,7 +52,7 @@ import shutil from allennlp.commands.subcommand import Subcommand -from allennlp.common.checks import ConfigurationError, check_for_gpu +from allennlp.common.checks import ConfigurationError, check_for_gpu, parse_cuda_device from allennlp.common import Params, Tqdm from allennlp.common.util import prepare_environment, lazy_groups_of from allennlp.data import Vocabulary, DataIterator @@ -193,6 +193,14 @@ def find_learning_rate_model( prepare_environment(params) cuda_device = params.params.get("trainer").get("cuda_device", -1) + devices = parse_cuda_device(cuda_device) + + # HACK: The trainer can not be constructed with multiple gpus. + # TODO(Mark): rework this so that cuda devices for distributed training are passed + # somewhere else, so configs are always valid. + if isinstance(devices, list): + cuda_device = devices[0] + params.params["trainer"]["cuda_device"] = cuda_device check_for_gpu(cuda_device) all_datasets = datasets_from_params(params) @@ -223,6 +231,7 @@ def find_learning_rate_model( train_data = all_datasets["train"] trainer_params = params.pop("trainer") + no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): @@ -296,10 +305,7 @@ def search_learning_rate( trainer.model.train() - num_gpus = len(trainer._cuda_devices) - - raw_train_generator = trainer.iterator(trainer.train_data, shuffle=trainer.shuffle) - train_generator = lazy_groups_of(raw_train_generator, num_gpus) + train_generator = trainer.iterator(trainer.train_data, shuffle=trainer.shuffle) train_generator_tqdm = Tqdm.tqdm(train_generator, total=num_batches) learning_rates = [] diff --git a/allennlp/commands/fine_tune.py b/allennlp/commands/fine_tune.py index 46ff0d56d3f..11882c14f55 100644 --- a/allennlp/commands/fine_tune.py +++ b/allennlp/commands/fine_tune.py @@ -382,7 +382,7 @@ def fine_tune_model( model, test_data, validation_iterator or iterator, - cuda_device=trainer._cuda_devices[0], + cuda_device=trainer.cuda_device, batch_weight_key=batch_weight_key, ) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index b568c7a22b8..443ce4accd0 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -511,7 +511,7 @@ def _train_worker( trainer.model, evaluation_dataset, evaluation_iterator, - cuda_device=trainer._cuda_devices[0], # pylint: disable=protected-access, + cuda_device=trainer.cuda_device, # TODO(brendanr): Pass in an arg following Joel's trainer refactor. 
batch_weight_key="", ) diff --git a/allennlp/training/callback_trainer.py b/allennlp/training/callback_trainer.py index be0528a760e..e1e9156ba0b 100644 --- a/allennlp/training/callback_trainer.py +++ b/allennlp/training/callback_trainer.py @@ -167,13 +167,9 @@ def generate_training_batches(self): Generates one epoch worth of training data. Stores it in trainer instance variables so that callbacks can access it. """ - num_gpus = len(self._cuda_devices) - - raw_train_generator = self.iterator(self.training_data, num_epochs=1, shuffle=self.shuffle) - self.training_batches = lazy_groups_of(raw_train_generator, num_gpus) - self.num_training_batches = math.ceil( - self.iterator.get_num_batches(self.training_data) / num_gpus - ) + train_generator = self.iterator(self.training_data, num_epochs=1, shuffle=self.shuffle) + self.training_batches = train_generator + self.num_training_batches = self.iterator.get_num_batches(self.training_data) def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch.Tensor: """ @@ -185,7 +181,7 @@ def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch """ assert len(batch_group) == 1 batch = batch_group[0] - batch = nn_util.move_to_device(batch, self._cuda_devices[0]) + batch = nn_util.move_to_device(batch, self.cuda_device) output_dict = self._pytorch_model(**batch) try: diff --git a/allennlp/training/callbacks/validate.py b/allennlp/training/callbacks/validate.py index 564cffd4bdc..973164f071f 100644 --- a/allennlp/training/callbacks/validate.py +++ b/allennlp/training/callbacks/validate.py @@ -67,13 +67,8 @@ def validate(self, trainer: "CallbackTrainer"): trainer.model.eval() - num_gpus = len(trainer._cuda_devices) - - raw_val_generator = self.iterator(self.instances, num_epochs=1, shuffle=False) - val_generator = lazy_groups_of(raw_val_generator, num_gpus) - num_validation_batches = math.ceil( - self.iterator.get_num_batches(self.instances) / num_gpus - ) + val_generator = self.iterator(self.instances, num_epochs=1, shuffle=False) + num_validation_batches = self.iterator.get_num_batches(self.instances) val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches) batches_this_epoch = 0 diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 2cfdc4fb70c..fa5790b74c8 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -277,7 +277,7 @@ def __init__( # normal case, reference to `Model` is retained. This reference is only used in # these places: `model.__call__`, `model.train` and `model.eval`. if self._distributed: - self._pytorch_model = DistributedDataParallel(self.model, device_ids=self._cuda_devices) + self._pytorch_model = DistributedDataParallel(self.model, device_ids=[self.cuda_device]) else: self._pytorch_model = self.model @@ -291,7 +291,7 @@ def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch """ assert len(batch_group) == 1 batch = batch_group[0] - batch = nn_util.move_to_device(batch, self._cuda_devices[0]) + batch = nn_util.move_to_device(batch, self.cuda_device) output_dict = self._pytorch_model(**batch) try: @@ -324,12 +324,9 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: # Set the model to "train" mode. 
self._pytorch_model.train() - num_gpus = len(self._cuda_devices) - # Get tqdm for the training batches - raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) - train_generator = lazy_groups_of(raw_train_generator, num_gpus) - num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data) / num_gpus) + train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle) + num_training_batches = self.iterator.get_num_batches(self.train_data) self._last_log = time.time() last_save_time = time.time() @@ -403,7 +400,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: train_loss, batches_this_epoch, world_size=self._world_size, - cuda_device=self._cuda_devices, + cuda_device=[self.cuda_device], ) # Updating tqdm only for the master as the trainers wouldn't have one @@ -447,7 +444,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: batches_this_epoch, reset=True, world_size=self._world_size, - cuda_device=self._cuda_devices, + cuda_device=[self.cuda_device], ) metrics["cpu_memory_MB"] = peak_cpu_usage for (gpu_num, memory) in gpu_usage: @@ -471,13 +468,8 @@ def _validation_loss(self) -> Tuple[float, int]: else: val_iterator = self.iterator - num_gpus = len(self._cuda_devices) - - raw_val_generator = val_iterator(self._validation_data, num_epochs=1, shuffle=False) - val_generator = lazy_groups_of(raw_val_generator, num_gpus) - num_validation_batches = math.ceil( - val_iterator.get_num_batches(self._validation_data) / num_gpus - ) + val_generator = val_iterator(self._validation_data, num_epochs=1, shuffle=False) + num_validation_batches = val_iterator.get_num_batches(self._validation_data) val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches) batches_this_epoch = 0 val_loss = 0 @@ -499,7 +491,7 @@ def _validation_loss(self) -> Tuple[float, int]: val_loss, batches_this_epoch, world_size=self._world_size, - cuda_device=self._cuda_devices, + cuda_device=[self.cuda_device], ) description = training_util.description_from_metrics(val_metrics) val_generator_tqdm.set_description(description, refresh=False) @@ -572,7 +564,7 @@ def train(self) -> Dict[str, Any]: num_batches, reset=True, world_size=self._world_size, - cuda_device=self._cuda_devices, + cuda_device=[self.cuda_device], ) # Check validation metric for early stopping diff --git a/allennlp/training/trainer_base.py b/allennlp/training/trainer_base.py index 72077e615d4..31689c19d97 100644 --- a/allennlp/training/trainer_base.py +++ b/allennlp/training/trainer_base.py @@ -58,7 +58,7 @@ def __init__( "`cuda_device` key in the experiment configuration." 
) - self._cuda_devices = [cuda_device] + self.cuda_device = cuda_device self._distributed = distributed self._rank = rank @@ -66,8 +66,8 @@ def __init__( self._world_size = world_size def _move_to_gpu(self, model: Model) -> Model: - if self._cuda_devices[0] != -1: - return model.cuda(self._cuda_devices[0]) + if self.cuda_device != -1: + return model.cuda(self.cuda_device) else: return model From 9a74ab76613ca2aa87439e2d0f346a5cec8965d1 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 11:49:49 -0800 Subject: [PATCH 05/15] fixes --- allennlp/training/callback_trainer.py | 14 ++++++-------- allennlp/training/trainer.py | 4 +--- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/allennlp/training/callback_trainer.py b/allennlp/training/callback_trainer.py index e1e9156ba0b..201313a2336 100644 --- a/allennlp/training/callback_trainer.py +++ b/allennlp/training/callback_trainer.py @@ -171,7 +171,7 @@ def generate_training_batches(self): self.training_batches = train_generator self.num_training_batches = self.iterator.get_num_batches(self.training_data) - def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch.Tensor: + def batch_loss(self, batch: TensorDict, for_training: bool) -> torch.Tensor: """ Does a forward pass on the given batches and returns the ``loss`` value in the result. If ``for_training`` is `True` also applies regularization penalty. @@ -179,8 +179,6 @@ def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch This is a method on the trainer so that it can be used both in training and validation (which are handled separately). """ - assert len(batch_group) == 1 - batch = batch_group[0] batch = nn_util.move_to_device(batch, self.cuda_device) output_dict = self._pytorch_model(**batch) @@ -198,7 +196,7 @@ def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch return loss - def train_one_batch_group(self, batch_group: List[TensorDict]) -> str: + def train_one_batch_group(self, batch: TensorDict) -> str: """ Handles the training for a single batch group. Fires off the events BATCH_START, FORWARD, BACKWARD, and BATCH_END. 
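With batch groups gone, each step in the callback trainer operates on a single TensorDict. As a rough sketch of the resulting per-batch flow (the `iterator`, `train_data`, and `trainer` names here are placeholders standing in for an already constructed setup, not code from this patch):

    # Minimal per-batch training sketch, assuming a constructed CallbackTrainer
    # with its iterator already indexed against the vocabulary.
    for batch in iterator(train_data, num_epochs=1, shuffle=True):
        trainer.optimizer.zero_grad()
        # batch_loss moves the single batch to trainer.cuda_device internally.
        loss = trainer.batch_loss(batch, for_training=True)
        loss.backward()
        trainer.optimizer.step()
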
@@ -210,7 +208,7 @@ def train_one_batch_group(self, batch_group: List[TensorDict]) -> str: self.batch_num_total += 1 self.handler.fire_event(Events.FORWARD) - loss = self.batch_loss(batch_group, for_training=True) + loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") @@ -248,11 +246,11 @@ def train_one_epoch(self) -> None: logger.info("Training") self.batches_this_epoch = 0 - batch_groups_tqdm = Tqdm.tqdm(self.training_batches, total=self.num_training_batches) + batches_tqdm = Tqdm.tqdm(self.training_batches, total=self.num_training_batches) - for self.batch_group in batch_groups_tqdm: + for self.batch_group in batches_tqdm: description = self.train_one_batch_group(self.batch_group) - batch_groups_tqdm.set_description(description, refresh=False) + batches_tqdm.set_description(description, refresh=False) self.handler.fire_event(Events.VALIDATE) self.handler.fire_event(Events.EPOCH_END) diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index fa5790b74c8..8f444e57a22 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -284,13 +284,11 @@ def __init__( def rescale_gradients(self) -> Optional[float]: return training_util.rescale_gradients(self.model, self._grad_norm) - def batch_loss(self, batch_group: List[TensorDict], for_training: bool) -> torch.Tensor: + def batch_loss(self, batch: TensorDict, for_training: bool) -> torch.Tensor: """ Does a forward pass on the given batches and returns the ``loss`` value in the result. If ``for_training`` is `True` also applies regularization penalty. """ - assert len(batch_group) == 1 - batch = batch_group[0] batch = nn_util.move_to_device(batch, self.cuda_device) output_dict = self._pytorch_model(**batch) From bdc9a8a4fa6a2f04ee11120766c71670f5ac904c Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 12:48:20 -0800 Subject: [PATCH 06/15] move distributed config up to top level --- allennlp/commands/train.py | 40 +++++++++++++-------------- allennlp/tests/commands/train_test.py | 11 ++++---- allennlp/training/callback_trainer.py | 11 +++----- allennlp/training/trainer.py | 11 +++----- allennlp/training/trainer_base.py | 2 +- 5 files changed, 34 insertions(+), 41 deletions(-) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 443ce4accd0..26aa0002c14 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -269,16 +269,16 @@ def train_model( create_serialization_dir(params, serialization_dir, recover, force) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) - cuda_device = params.params.get("trainer").get("cuda_device", -1) - device_id = parse_cuda_device(cuda_device) + cuda_device = params.params.pop("distributed_cuda_devices", -1) + device_ids = parse_cuda_device(cuda_device) check_for_gpu(cuda_device) - multi_device = isinstance(device_id, list) - distributed = params.params.get("trainer").get("distributed", False) + multi_device = isinstance(device_ids, list) + distributed = params.params.pop("distributed", False) # If distributed isn't in the config and the config contains strictly # one cuda device, we just run a single training process. - if not distributed or not multi_device: + if not distributed: model = _train_worker( process_rank=0, params=params, @@ -301,10 +301,10 @@ def train_model( # Otherwise, we are running multiple processes for training. 
else: - master_addr = params.params.get("trainer").pop("master_address", "127.0.0.1") - master_port = params.params.get("trainer").pop("master_port", 29500) - num_procs = len(device_id) - num_nodes = params.params.get("trainer").pop("num_nodes", 1) + master_addr = params.params.pop("master_address", "127.0.0.1") + master_port = params.params.pop("master_port", 29500) + num_procs = len(device_ids) + num_nodes = params.params.pop("num_nodes", 1) world_size = num_nodes * num_procs os.environ["MASTER_ADDR"] = master_addr @@ -339,10 +339,10 @@ def train_model( cache_prefix, include_package, node_rank, - num_procs, master_addr, master_port, world_size, + device_ids, ), nprocs=num_procs, ) @@ -360,10 +360,10 @@ def _train_worker( cache_prefix: str = None, include_package: List[str] = None, node_rank: int = 0, - num_procs_per_node: int = 0, master_addr: str = "127.0.0.1", master_port: int = 29500, world_size: int = 1, + distributed_device_ids: List[str] = None, ) -> Optional[Model]: """ Helper to train the configured model/experiment. In distributed mode, this is spawned as a @@ -422,18 +422,22 @@ def _train_worker( for package_name in include_package: import_submodules(package_name) + num_procs_per_node = len(distributed_device_ids) # The Unique identifier of the worker process among all the processes in the # distributed training group is computed here. This is used while initializing # the process group using `init_process_group` global_rank = node_rank * num_procs_per_node + process_rank - cuda_device = params.params.get("trainer").get("cuda_device", -1) - device_list = parse_cuda_device(cuda_device) - # In distributed training, the configured device is always going to be a list. # The corresponding gpu id for the particular worker is obtained by picking the id # from the device list with the rank as index - gpu_id = device_list[process_rank] # type: ignore + gpu_id = distributed_device_ids[process_rank] # type: ignore + + # Till now, "cuda_device" might not be set in the trainer params. + # But a worker trainer needs to only know about its specific GPU id. + params["trainer"]["cuda_device"] = gpu_id + params["trainer"]["world_size"] = world_size + torch.cuda.set_device(gpu_id) dist.init_process_group( @@ -447,12 +451,6 @@ def _train_worker( f"for distributed training in worker {global_rank}" ) - # Till now, "cuda_device" will be a list of ids as configured originally - # in params. But a worker trainer needs to only know about its specific - # GPU id. 
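For the rank bookkeeping above, a small worked example may help; the numbers are invented and only the formulas come from this patch:

    # Hypothetical two-node, two-GPU-per-node setup.
    distributed_device_ids = [0, 1]
    num_procs_per_node = len(distributed_device_ids)              # 2
    num_nodes = 2
    world_size = num_nodes * num_procs_per_node                   # 4
    node_rank, process_rank = 1, 1                                # second node, second worker
    global_rank = node_rank * num_procs_per_node + process_rank   # 3
    gpu_id = distributed_device_ids[process_rank]                 # local GPU id 1
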
- params["trainer"]["cuda_device"] = gpu_id - params["trainer"]["world_size"] = world_size - trainer_type = params.get("trainer", {}).get("type", "default") if trainer_type == "default": diff --git a/allennlp/tests/commands/train_test.py b/allennlp/tests/commands/train_test.py index 428a6fdc802..007bc9cd5c6 100644 --- a/allennlp/tests/commands/train_test.py +++ b/allennlp/tests/commands/train_test.py @@ -102,9 +102,9 @@ def test_train_model_distributed(self): "trainer": { "num_epochs": 2, "optimizer": "adam", - "distributed": True, - "cuda_device": [0, 1], }, + "distributed": True, + "distributed_cuda_devices": [0, 1], } ) @@ -136,7 +136,8 @@ def test_distributed_raises_error_with_no_gpus(self): "train_data_path": SEQUENCE_TAGGING_DATA_PATH, "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, "iterator": {"type": "basic", "batch_size": 2}, - "trainer": {"num_epochs": 2, "optimizer": "adam", "distributed": True}, + "trainer": {"num_epochs": 2, "optimizer": "adam"}, + "distributed": True } ) with pytest.raises(ConfigurationError): @@ -183,8 +184,8 @@ def test_error_is_throw_when_cuda_device_is_not_available(self): "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, }, "dataset_reader": {"type": "sequence_tagging"}, - "train_data_path": "tests/fixtures/data/sequence_tagging.tsv", - "validation_data_path": "tests/fixtures/data/sequence_tagging.tsv", + "train_data_path": "allennlp/tests/fixtures/data/sequence_tagging.tsv", + "validation_data_path": "allennlp/tests/fixtures/data/sequence_tagging.tsv", "iterator": {"type": "basic", "batch_size": 2}, "trainer": { "num_epochs": 2, diff --git a/allennlp/training/callback_trainer.py b/allennlp/training/callback_trainer.py index 201313a2336..d5b05433c79 100644 --- a/allennlp/training/callback_trainer.py +++ b/allennlp/training/callback_trainer.py @@ -11,7 +11,7 @@ import torch from allennlp.common import Params -from allennlp.common.checks import parse_cuda_device +from allennlp.common.checks import parse_cuda_device, check_for_gpu from allennlp.common.tqdm import Tqdm from allennlp.common.util import lazy_groups_of from allennlp.data.instance import Instance @@ -316,14 +316,11 @@ def from_params( # type: ignore num_epochs = params.pop_int("num_epochs", 20) cuda_device = parse_cuda_device(params.pop("cuda_device", -1)) - if isinstance(cuda_device, list): - model_device = cuda_device[0] - else: - model_device = cuda_device - if model_device >= 0: + check_for_gpu(cuda_device) + if cuda_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. 
- model = model.cuda(model_device) + model = model.cuda(cuda_device) parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] optimizer = Optimizer.from_params(parameters, params.pop("optimizer")) diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 8f444e57a22..17ce37fe866 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -12,7 +12,7 @@ from torch.nn.parallel import DistributedDataParallel from allennlp.common import Params -from allennlp.common.checks import ConfigurationError, parse_cuda_device +from allennlp.common.checks import ConfigurationError, parse_cuda_device, check_for_gpu from allennlp.common.tqdm import Tqdm from allennlp.common.util import dump_metrics, gpu_memory_mb, peak_memory_mb, lazy_groups_of from allennlp.data.instance import Instance @@ -764,14 +764,11 @@ def from_params( # type: ignore lr_scheduler_params = params.pop("learning_rate_scheduler", None) momentum_scheduler_params = params.pop("momentum_scheduler", None) - if isinstance(cuda_device, list): - model_device = cuda_device[0] - else: - model_device = cuda_device - if model_device >= 0: + check_for_gpu(cuda_device) + if cuda_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. - model = model.cuda(model_device) + model = model.cuda(cuda_device) parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad] optimizer = Optimizer.from_params(parameters, params.pop("optimizer")) diff --git a/allennlp/training/trainer_base.py b/allennlp/training/trainer_base.py index 31689c19d97..7f961263722 100644 --- a/allennlp/training/trainer_base.py +++ b/allennlp/training/trainer_base.py @@ -36,8 +36,8 @@ def __init__( rank: int = 0, world_size: int = 1, ) -> None: - check_for_gpu(cuda_device) + check_for_gpu(cuda_device) self._serialization_dir = serialization_dir if isinstance(cuda_device, list): From 833f54d59b7eb2fe40a7ebb270da62db42dd139f Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 12:52:14 -0800 Subject: [PATCH 07/15] lint --- allennlp/commands/find_learning_rate.py | 2 +- allennlp/commands/train.py | 1 - allennlp/tests/commands/train_test.py | 7 ++----- allennlp/training/callback_trainer.py | 6 ++---- allennlp/training/callbacks/validate.py | 2 -- allennlp/training/trainer.py | 5 ++--- 6 files changed, 7 insertions(+), 16 deletions(-) diff --git a/allennlp/commands/find_learning_rate.py b/allennlp/commands/find_learning_rate.py index b0347b2d5aa..5556c3f6892 100644 --- a/allennlp/commands/find_learning_rate.py +++ b/allennlp/commands/find_learning_rate.py @@ -54,7 +54,7 @@ from allennlp.commands.subcommand import Subcommand from allennlp.common.checks import ConfigurationError, check_for_gpu, parse_cuda_device from allennlp.common import Params, Tqdm -from allennlp.common.util import prepare_environment, lazy_groups_of +from allennlp.common.util import prepare_environment from allennlp.data import Vocabulary, DataIterator from allennlp.models import Model from allennlp.training import Trainer diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 26aa0002c14..aa160203bdd 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -438,7 +438,6 @@ def _train_worker( params["trainer"]["cuda_device"] = gpu_id params["trainer"]["world_size"] = world_size - torch.cuda.set_device(gpu_id) dist.init_process_group( backend="nccl", diff --git a/allennlp/tests/commands/train_test.py b/allennlp/tests/commands/train_test.py 
index 007bc9cd5c6..ab98f5dcc06 100644 --- a/allennlp/tests/commands/train_test.py +++ b/allennlp/tests/commands/train_test.py @@ -99,10 +99,7 @@ def test_train_model_distributed(self): "train_data_path": SEQUENCE_TAGGING_DATA_PATH, "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, "iterator": {"type": "basic", "batch_size": 2}, - "trainer": { - "num_epochs": 2, - "optimizer": "adam", - }, + "trainer": {"num_epochs": 2, "optimizer": "adam"}, "distributed": True, "distributed_cuda_devices": [0, 1], } @@ -137,7 +134,7 @@ def test_distributed_raises_error_with_no_gpus(self): "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, "iterator": {"type": "basic", "batch_size": 2}, "trainer": {"num_epochs": 2, "optimizer": "adam"}, - "distributed": True + "distributed": True, } ) with pytest.raises(ConfigurationError): diff --git a/allennlp/training/callback_trainer.py b/allennlp/training/callback_trainer.py index d5b05433c79..dcb26fe498c 100644 --- a/allennlp/training/callback_trainer.py +++ b/allennlp/training/callback_trainer.py @@ -6,15 +6,13 @@ import time import datetime import functools -import math from typing import Dict, Optional, List, Any, Iterable import torch from allennlp.common import Params from allennlp.common.checks import parse_cuda_device, check_for_gpu from allennlp.common.tqdm import Tqdm -from allennlp.common.util import lazy_groups_of -from allennlp.data.instance import Instance +from allennlp.data import Instance from allennlp.data.iterators.data_iterator import DataIterator, TensorDict from allennlp.models.model import Model from allennlp.nn import util as nn_util @@ -345,7 +343,7 @@ def from_params( # type: ignore world_size = params.pop_int("world_size", 1) if distributed: - rank = model_device + rank = cuda_device else: rank = 0 diff --git a/allennlp/training/callbacks/validate.py b/allennlp/training/callbacks/validate.py index 973164f071f..614f34905f6 100644 --- a/allennlp/training/callbacks/validate.py +++ b/allennlp/training/callbacks/validate.py @@ -1,11 +1,9 @@ from typing import Iterable, List, TYPE_CHECKING import logging -import math import torch from allennlp.common.tqdm import Tqdm -from allennlp.common.util import lazy_groups_of from allennlp.data.instance import Instance from allennlp.data.iterators import DataIterator from allennlp.training import util as training_util diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 17ce37fe866..3725325b989 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -1,10 +1,9 @@ import datetime import logging -import math import os import time import traceback -from typing import Dict, Optional, List, Tuple, Union, Iterable, Any +from typing import Dict, Optional, Tuple, Union, Iterable, Any import torch import torch.distributed as dist @@ -14,7 +13,7 @@ from allennlp.common import Params from allennlp.common.checks import ConfigurationError, parse_cuda_device, check_for_gpu from allennlp.common.tqdm import Tqdm -from allennlp.common.util import dump_metrics, gpu_memory_mb, peak_memory_mb, lazy_groups_of +from allennlp.common.util import dump_metrics, gpu_memory_mb, peak_memory_mb from allennlp.data.instance import Instance from allennlp.data.iterators.data_iterator import DataIterator, TensorDict from allennlp.models.model import Model From 1ba654c129091f98496cf31e3652aed489ab3aa1 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 13:11:07 -0800 Subject: [PATCH 08/15] clean up --- allennlp/commands/train.py | 9 ++++----- 
.../tests/training/gan_callback_trainer_test.py | 6 +++--- allennlp/training/callback_trainer.py | 16 ++++++++++------ .../training/callbacks/log_to_tensorboard.py | 2 +- allennlp/training/trainer.py | 4 ++++ 5 files changed, 22 insertions(+), 15 deletions(-) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index aa160203bdd..8f49aebe52e 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -56,7 +56,7 @@ from allennlp.commands.make_vocab import make_vocab_from_params from allennlp.commands.subcommand import Subcommand from allennlp.common import Params -from allennlp.common.checks import ConfigurationError, check_for_gpu, parse_cuda_device +from allennlp.common.checks import ConfigurationError, check_for_gpu from allennlp.common.util import ( prepare_environment, prepare_global_logging, @@ -269,11 +269,10 @@ def train_model( create_serialization_dir(params, serialization_dir, recover, force) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) - cuda_device = params.params.pop("distributed_cuda_devices", -1) - device_ids = parse_cuda_device(cuda_device) - check_for_gpu(cuda_device) + device_ids = params.params.pop("distributed_cuda_devices", -1) + check_for_gpu(device_ids) - multi_device = isinstance(device_ids, list) + multi_device = isinstance(device_ids, list) and len(device_ids) > 1 distributed = params.params.pop("distributed", False) # If distributed isn't in the config and the config contains strictly diff --git a/allennlp/tests/training/gan_callback_trainer_test.py b/allennlp/tests/training/gan_callback_trainer_test.py index 05af33b5a87..b10e9c9de70 100644 --- a/allennlp/tests/training/gan_callback_trainer_test.py +++ b/allennlp/tests/training/gan_callback_trainer_test.py @@ -207,7 +207,7 @@ def __init__( num_epochs: int = 20, shuffle: bool = False, serialization_dir: Optional[str] = None, - cuda_device: Union[int, List] = -1, + cuda_device: int = -1, callbacks: List[Callback] = None, distributed: bool = False, rank: int = 0, @@ -235,9 +235,9 @@ def _reset_counters(self) -> None: self.fake_stdev = 0.0 self.count = 0 - def train_one_batch_group(self, batch_group): + def train_one_batch(self, batch_group): # Each batch_group should have only one batch - batch, = batch_group + batch = batch_group array = batch["array"] # We should not have mixed batches: diff --git a/allennlp/training/callback_trainer.py b/allennlp/training/callback_trainer.py index dcb26fe498c..9189516c460 100644 --- a/allennlp/training/callback_trainer.py +++ b/allennlp/training/callback_trainer.py @@ -10,7 +10,7 @@ import torch from allennlp.common import Params -from allennlp.common.checks import parse_cuda_device, check_for_gpu +from allennlp.common.checks import parse_cuda_device, check_for_gpu, ConfigurationError from allennlp.common.tqdm import Tqdm from allennlp.data import Instance from allennlp.data.iterators.data_iterator import DataIterator, TensorDict @@ -125,7 +125,7 @@ def __init__( self.metrics: Dict[str, Any] = {} self.batch_num_total = 0 - self.batch_group: List[TensorDict] = [] + self.batch: TensorDict = None self.batches_this_epoch = 0 self.training_batches: Iterable[List[TensorDict]] = () @@ -194,7 +194,7 @@ def batch_loss(self, batch: TensorDict, for_training: bool) -> torch.Tensor: return loss - def train_one_batch_group(self, batch: TensorDict) -> str: + def train_one_batch(self, batch: TensorDict) -> str: """ Handles the training for a single batch group. Fires off the events BATCH_START, FORWARD, BACKWARD, and BATCH_END. 
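Custom trainers that previously hooked into the per-group method only need to adopt the single-batch signature. A sketch of a subclass in the spirit of the GAN test above (the class name and counter are illustrative only):

    # Illustrative override: `batch` is now one TensorDict, not a list of batches.
    from allennlp.training.callback_trainer import CallbackTrainer

    class CountingCallbackTrainer(CallbackTrainer):
        def train_one_batch(self, batch):
            # No unpacking of a batch group is needed before delegating.
            self.batches_seen = getattr(self, "batches_seen", 0) + 1
            return super().train_one_batch(batch)
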
@@ -231,7 +231,7 @@ def train_one_epoch(self) -> None: """ Trains the model for a single epoch. Fires off the events EPOCH_START and EPOCH_END, - and repeatedly calls self.train_one_batch_group(). + and repeatedly calls self.train_one_batch(). """ self.handler.fire_event(Events.EPOCH_START) @@ -246,8 +246,8 @@ def train_one_epoch(self) -> None: batches_tqdm = Tqdm.tqdm(self.training_batches, total=self.num_training_batches) - for self.batch_group in batches_tqdm: - description = self.train_one_batch_group(self.batch_group) + for self.batch in batches_tqdm: + description = self.train_one_batch(self.batch) batches_tqdm.set_description(description, refresh=False) self.handler.fire_event(Events.VALIDATE) @@ -315,6 +315,10 @@ def from_params( # type: ignore cuda_device = parse_cuda_device(params.pop("cuda_device", -1)) check_for_gpu(cuda_device) + if isinstance(cuda_device, list): + raise ConfigurationError( + "In allennlp 1.0, the Trainer cannot be passed multiple cuda devices." + ) if cuda_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. diff --git a/allennlp/training/callbacks/log_to_tensorboard.py b/allennlp/training/callbacks/log_to_tensorboard.py index c263eac0667..752ab496124 100644 --- a/allennlp/training/callbacks/log_to_tensorboard.py +++ b/allennlp/training/callbacks/log_to_tensorboard.py @@ -82,7 +82,7 @@ def batch_end_logging(self, trainer: "CallbackTrainer"): ) if self.log_batch_size_period: - cur_batch = sum([training_util.get_batch_size(batch) for batch in trainer.batch_group]) + cur_batch = training_util.get_batch_size(trainer.batch) self.cumulative_batch_size += cur_batch if (trainer.batches_this_epoch - 1) % self.log_batch_size_period == 0: average = self.cumulative_batch_size / trainer.batches_this_epoch diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 3725325b989..805491b3151 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -764,6 +764,10 @@ def from_params( # type: ignore momentum_scheduler_params = params.pop("momentum_scheduler", None) check_for_gpu(cuda_device) + if isinstance(cuda_device, list): + raise ConfigurationError( + "In allennlp 1.0, the Trainer cannot be passed multiple cuda devices." + ) if cuda_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. 
From 5358eed56edbe1ba349842b92e5de28f87555738 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 13:18:43 -0800 Subject: [PATCH 09/15] rename occurences of batch_group --- allennlp/commands/find_learning_rate.py | 4 ++-- allennlp/tests/training/gan_callback_trainer_test.py | 8 +++----- allennlp/training/callbacks/validate.py | 4 ++-- allennlp/training/trainer.py | 10 +++++----- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/allennlp/commands/find_learning_rate.py b/allennlp/commands/find_learning_rate.py index 5556c3f6892..95141012375 100644 --- a/allennlp/commands/find_learning_rate.py +++ b/allennlp/commands/find_learning_rate.py @@ -316,7 +316,7 @@ def search_learning_rate( else: lr_update_factor = (end_lr / start_lr) ** (1.0 / num_batches) - for i, batch_group in enumerate(train_generator_tqdm): + for i, batch in enumerate(train_generator_tqdm): if linear_steps: current_lr = start_lr + (lr_update_factor * i) @@ -327,7 +327,7 @@ def search_learning_rate( param_group["lr"] = current_lr trainer.optimizer.zero_grad() - loss = trainer.batch_loss(batch_group, for_training=True) + loss = trainer.batch_loss(batch, for_training=True) loss.backward() loss = loss.detach().cpu().item() diff --git a/allennlp/tests/training/gan_callback_trainer_test.py b/allennlp/tests/training/gan_callback_trainer_test.py index b10e9c9de70..c68a19f6530 100644 --- a/allennlp/tests/training/gan_callback_trainer_test.py +++ b/allennlp/tests/training/gan_callback_trainer_test.py @@ -235,11 +235,9 @@ def _reset_counters(self) -> None: self.fake_stdev = 0.0 self.count = 0 - def train_one_batch(self, batch_group): - # Each batch_group should have only one batch - batch = batch_group - array = batch["array"] + def train_one_batch(self, batch): + array = batch["array"] # We should not have mixed batches: if len(set(batch["stage"])) != 1: raise ValueError("mixed batch") @@ -290,7 +288,7 @@ def train_one_epoch(self) -> None: # Reset epoch counters self._reset_counters() - # Will call `self.train_one_batch_group` + # Will call `self.train_one_batch` super().train_one_epoch() diff --git a/allennlp/training/callbacks/validate.py b/allennlp/training/callbacks/validate.py index 614f34905f6..32f22e655f6 100644 --- a/allennlp/training/callbacks/validate.py +++ b/allennlp/training/callbacks/validate.py @@ -71,9 +71,9 @@ def validate(self, trainer: "CallbackTrainer"): batches_this_epoch = 0 val_loss = 0 - for batch_group in val_generator_tqdm: + for batch in val_generator_tqdm: - loss = trainer.batch_loss(batch_group, for_training=False) + loss = trainer.batch_loss(batch, for_training=False) if loss is not None: # You shouldn't necessarily have to compute a loss for validation, so we allow for # `loss` to be None. 
We need to be careful, though - `batches_this_epoch` is diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 805491b3151..021573db0de 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -343,14 +343,14 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: train_generator_tqdm = train_generator cumulative_batch_size = 0 - for batch_group in train_generator_tqdm: + for batch in train_generator_tqdm: batches_this_epoch += 1 self._batch_num_total += 1 batch_num_total = self._batch_num_total self.optimizer.zero_grad() - loss = self.batch_loss(batch_group, for_training=True) + loss = self.batch_loss(batch, for_training=True) if torch.isnan(loss): raise ValueError("nan loss encountered") @@ -417,7 +417,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: self._tensorboard.log_histograms(self.model, histogram_parameters) if self._log_batch_size_period: - cur_batch = sum([training_util.get_batch_size(batch) for batch in batch_group]) + cur_batch = training_util.get_batch_size(batch) cumulative_batch_size += cur_batch if (batches_this_epoch - 1) % self._log_batch_size_period == 0: average = cumulative_batch_size / batches_this_epoch @@ -470,9 +470,9 @@ def _validation_loss(self) -> Tuple[float, int]: val_generator_tqdm = Tqdm.tqdm(val_generator, total=num_validation_batches) batches_this_epoch = 0 val_loss = 0 - for batch_group in val_generator_tqdm: + for batch in val_generator_tqdm: - loss = self.batch_loss(batch_group, for_training=False) + loss = self.batch_loss(batch, for_training=False) if loss is not None: # You shouldn't necessarily have to compute a loss for validation, so we allow for # `loss` to be None. We need to be careful, though - `batches_this_epoch` is From 8d810ea3d6f9e1ed467095348d71613206e541a7 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 13:23:20 -0800 Subject: [PATCH 10/15] remove hack from find_learning_rate --- allennlp/commands/find_learning_rate.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/allennlp/commands/find_learning_rate.py b/allennlp/commands/find_learning_rate.py index 95141012375..c2af08f16f9 100644 --- a/allennlp/commands/find_learning_rate.py +++ b/allennlp/commands/find_learning_rate.py @@ -52,7 +52,7 @@ import shutil from allennlp.commands.subcommand import Subcommand -from allennlp.common.checks import ConfigurationError, check_for_gpu, parse_cuda_device +from allennlp.common.checks import ConfigurationError, check_for_gpu from allennlp.common import Params, Tqdm from allennlp.common.util import prepare_environment from allennlp.data import Vocabulary, DataIterator @@ -193,14 +193,6 @@ def find_learning_rate_model( prepare_environment(params) cuda_device = params.params.get("trainer").get("cuda_device", -1) - devices = parse_cuda_device(cuda_device) - - # HACK: The trainer can not be constructed with multiple gpus. - # TODO(Mark): rework this so that cuda devices for distributed training are passed - # somewhere else, so configs are always valid. 
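The removed TODO above is addressed later in this series by configuring devices for distributed training outside the trainer. For reference, a config shaped along the lines of the updated test fixtures, with placeholder values:

    # Sketch of the top-level distributed block; the "distributed" and "trainer"
    # keys mirror shapes used elsewhere in this series, other values are placeholders.
    config = {
        "trainer": {"num_epochs": 2, "optimizer": "adam"},
        "distributed": {"cuda_devices": [0, 1], "num_nodes": 1},
    }
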
- if isinstance(devices, list): - cuda_device = devices[0] - params.params["trainer"]["cuda_device"] = cuda_device check_for_gpu(cuda_device) all_datasets = datasets_from_params(params) From b014b9f1a55ea6f1739a608ea982989c0fc57ab1 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 13:29:11 -0800 Subject: [PATCH 11/15] fix last tests --- allennlp/tests/models/simple_tagger_test.py | 8 ++++---- allennlp/tests/training/callback_trainer_test.py | 1 + allennlp/tests/training/trainer_test.py | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/allennlp/tests/models/simple_tagger_test.py b/allennlp/tests/models/simple_tagger_test.py index 8f3d103ae9d..eae4590e627 100644 --- a/allennlp/tests/models/simple_tagger_test.py +++ b/allennlp/tests/models/simple_tagger_test.py @@ -63,8 +63,8 @@ def test_regularization(self): training_batch = next(iterator(self.instances, num_epochs=1)) validation_batch = next(iterator(self.instances, num_epochs=1)) - training_loss = trainer.batch_loss([training_batch], for_training=True).item() - validation_loss = trainer.batch_loss([validation_batch], for_training=False).item() + training_loss = trainer.batch_loss(training_batch, for_training=True).item() + validation_loss = trainer.batch_loss(validation_batch, for_training=False).item() # Training loss should have the regularization penalty, but validation loss should not. numpy.testing.assert_almost_equal(training_loss, validation_loss) @@ -124,8 +124,8 @@ def test_regularization(self): training_batch = next(self.iterator(self.instances, num_epochs=1)) validation_batch = next(self.iterator(self.instances, num_epochs=1)) - training_loss = self.trainer.batch_loss([training_batch], for_training=True).data - validation_loss = self.trainer.batch_loss([validation_batch], for_training=False).data + training_loss = self.trainer.batch_loss(training_batch, for_training=True).data + validation_loss = self.trainer.batch_loss(validation_batch, for_training=False).data # Training loss should have the regularization penalty, but validation loss should not. 
assert (training_loss != validation_loss).all() diff --git a/allennlp/tests/training/callback_trainer_test.py b/allennlp/tests/training/callback_trainer_test.py index 2d9229d806f..67fc8ace435 100644 --- a/allennlp/tests/training/callback_trainer_test.py +++ b/allennlp/tests/training/callback_trainer_test.py @@ -269,6 +269,7 @@ def test_trainer_can_run_cuda(self): assert "peak_gpu_0_memory_MB" in metrics assert isinstance(metrics["peak_gpu_0_memory_MB"], int) + @pytest.mark.skipif(torch.cuda.device_count() < 2 , reason="2 or more GPUs required.") def test_passing_trainer_multiple_gpus_raises_error(self): self.model.cuda() diff --git a/allennlp/tests/training/trainer_test.py b/allennlp/tests/training/trainer_test.py index 1a182e4630f..1a520facd59 100644 --- a/allennlp/tests/training/trainer_test.py +++ b/allennlp/tests/training/trainer_test.py @@ -114,6 +114,7 @@ def test_trainer_can_run_cuda(self): assert "peak_gpu_0_memory_MB" in metrics assert isinstance(metrics["peak_gpu_0_memory_MB"], int) + @pytest.mark.skipif(torch.cuda.device_count() < 2 , reason="2 or more GPUs required.") def test_passing_trainer_multiple_gpus_raises_error(self): self.model.cuda() From df23b16c23d8801efc2156bdb5645177ed6b2e20 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 13:32:19 -0800 Subject: [PATCH 12/15] black --- allennlp/tests/training/callback_trainer_test.py | 2 +- allennlp/tests/training/trainer_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/tests/training/callback_trainer_test.py b/allennlp/tests/training/callback_trainer_test.py index 67fc8ace435..60af37a4a81 100644 --- a/allennlp/tests/training/callback_trainer_test.py +++ b/allennlp/tests/training/callback_trainer_test.py @@ -269,7 +269,7 @@ def test_trainer_can_run_cuda(self): assert "peak_gpu_0_memory_MB" in metrics assert isinstance(metrics["peak_gpu_0_memory_MB"], int) - @pytest.mark.skipif(torch.cuda.device_count() < 2 , reason="2 or more GPUs required.") + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="2 or more GPUs required.") def test_passing_trainer_multiple_gpus_raises_error(self): self.model.cuda() diff --git a/allennlp/tests/training/trainer_test.py b/allennlp/tests/training/trainer_test.py index 1a520facd59..f479c9cf62f 100644 --- a/allennlp/tests/training/trainer_test.py +++ b/allennlp/tests/training/trainer_test.py @@ -114,7 +114,7 @@ def test_trainer_can_run_cuda(self): assert "peak_gpu_0_memory_MB" in metrics assert isinstance(metrics["peak_gpu_0_memory_MB"], int) - @pytest.mark.skipif(torch.cuda.device_count() < 2 , reason="2 or more GPUs required.") + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="2 or more GPUs required.") def test_passing_trainer_multiple_gpus_raises_error(self): self.model.cuda() From 69db3d4b9d79fb1c466f12bf025e42fb8778b66e Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 14:37:48 -0800 Subject: [PATCH 13/15] use a top level distributed config --- allennlp/commands/train.py | 33 +++++++++++++-------------- allennlp/tests/commands/train_test.py | 5 ++-- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 8f49aebe52e..95631336916 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -269,15 +269,10 @@ def train_model( create_serialization_dir(params, serialization_dir, recover, force) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) - device_ids = params.params.pop("distributed_cuda_devices", -1) - 
check_for_gpu(device_ids) - - multi_device = isinstance(device_ids, list) and len(device_ids) > 1 - distributed = params.params.pop("distributed", False) - + distributed_params = params.params.pop("distributed", None) # If distributed isn't in the config and the config contains strictly # one cuda device, we just run a single training process. - if not distributed: + if distributed_params is None: model = _train_worker( process_rank=0, params=params, @@ -291,19 +286,23 @@ def train_model( archive_model(serialization_dir, files_to_archive=params.files_to_archive) return model - # If the config contains the distributed flag, but only one GPU, we raise an error, - # because this combination is probably a mistake. - elif distributed and not multi_device: - raise ConfigurationError( - "Multiple cuda devices need to be configured to run distributed training." - ) - # Otherwise, we are running multiple processes for training. else: - master_addr = params.params.pop("master_address", "127.0.0.1") - master_port = params.params.pop("master_port", 29500) + # We are careful here so that we can raise a good error if someone + # passed the wrong thing - cuda_devices are required. + device_ids = distributed_params.pop("cuda_devices", None) + multi_device = isinstance(device_ids, list) and len(device_ids) > 1 + + if not multi_device: + raise ConfigurationError( + "Multiple cuda devices need to be configured to run distributed training." + ) + check_for_gpu(device_ids) + + master_addr = distributed_params.pop("master_address", "127.0.0.1") + master_port = distributed_params.pop("master_port", 29500) num_procs = len(device_ids) - num_nodes = params.params.pop("num_nodes", 1) + num_nodes = distributed_params.pop("num_nodes", 1) world_size = num_nodes * num_procs os.environ["MASTER_ADDR"] = master_addr diff --git a/allennlp/tests/commands/train_test.py b/allennlp/tests/commands/train_test.py index ab98f5dcc06..934e79ad91f 100644 --- a/allennlp/tests/commands/train_test.py +++ b/allennlp/tests/commands/train_test.py @@ -100,8 +100,7 @@ def test_train_model_distributed(self): "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, "iterator": {"type": "basic", "batch_size": 2}, "trainer": {"num_epochs": 2, "optimizer": "adam"}, - "distributed": True, - "distributed_cuda_devices": [0, 1], + "distributed": {"cuda_devices": [0, 1]}, } ) @@ -134,7 +133,7 @@ def test_distributed_raises_error_with_no_gpus(self): "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, "iterator": {"type": "basic", "batch_size": 2}, "trainer": {"num_epochs": 2, "optimizer": "adam"}, - "distributed": True, + "distributed": {}, } ) with pytest.raises(ConfigurationError): From bc2c2d1d29a9bd9f69d58518a4c80d0598161bbc Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 14:45:50 -0800 Subject: [PATCH 14/15] correct error for int --- allennlp/training/trainer_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/training/trainer_base.py b/allennlp/training/trainer_base.py index 7f961263722..65e336fb18f 100644 --- a/allennlp/training/trainer_base.py +++ b/allennlp/training/trainer_base.py @@ -31,7 +31,7 @@ class TrainerBase(Registrable): def __init__( self, serialization_dir: str, - cuda_device: Union[int, List] = -1, + cuda_device: int = -1, distributed: bool = False, rank: int = 0, world_size: int = 1, @@ -49,7 +49,7 @@ def __init__( if not isinstance(cuda_device, int): raise ConfigurationError( - "Expected an int or list for cuda_device, got {}".format(cuda_device) + "Expected an int for cuda_device, 
got {}".format(cuda_device) ) if distributed and world_size <= 1: From 2398e8f57153cde8d76de0311f8bbf69ff4a56e9 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 13 Dec 2019 15:09:22 -0800 Subject: [PATCH 15/15] change up parse_cuda_devices to raise good error and be strongly typed --- allennlp/commands/train.py | 1 + allennlp/common/checks.py | 29 +++++++++++++++++++++++---- allennlp/training/callback_trainer.py | 6 +----- allennlp/training/trainer.py | 4 ---- allennlp/training/trainer_base.py | 6 ++---- 5 files changed, 29 insertions(+), 17 deletions(-) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 95631336916..105b205e098 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -435,6 +435,7 @@ def _train_worker( # But a worker trainer needs to only know about its specific GPU id. params["trainer"]["cuda_device"] = gpu_id params["trainer"]["world_size"] = world_size + params["trainer"]["distributed"] = True torch.cuda.set_device(gpu_id) dist.init_process_group( diff --git a/allennlp/common/checks.py b/allennlp/common/checks.py index 6ef6450aeb9..c15fe37c12e 100644 --- a/allennlp/common/checks.py +++ b/allennlp/common/checks.py @@ -52,14 +52,36 @@ def check_dimensions_match( ) -def parse_cuda_device(cuda_device: Union[str, int, List[int]]) -> Union[int, List[int]]: +def parse_cuda_device(cuda_device: Union[str, int, List[int]]) -> int: """ Disambiguates single GPU and multiple GPU settings for cuda_device param. """ + message = """ + In allennlp 1.0, the Trainer cannot be passed multiple cuda devices. + Instead, use the faster Distributed Data Parallel. For instance, if you previously had config like: + { + "trainer": { + "cuda_device": [0, 1, 2, 3], + "num_epochs": 20, + ... + } + } + simply change it to: + { + "distributed": { + "cuda_devices": [0, 1, 2, 3], + }, + "trainer": { + "num_epochs": 20, + ... + } + } + """ + def from_list(strings): if len(strings) > 1: - return [int(d) for d in strings] + raise ConfigurationError(message) elif len(strings) == 1: return int(strings[0]) else: @@ -76,8 +98,7 @@ def from_list(strings): return int(cuda_device) # type: ignore -def check_for_gpu(device_id: Union[int, list]): - device_id = parse_cuda_device(device_id) +def check_for_gpu(device_id: Union[int, List[int]]): if isinstance(device_id, list): for did in device_id: check_for_gpu(did) diff --git a/allennlp/training/callback_trainer.py b/allennlp/training/callback_trainer.py index 9189516c460..8f302e4c5d6 100644 --- a/allennlp/training/callback_trainer.py +++ b/allennlp/training/callback_trainer.py @@ -10,7 +10,7 @@ import torch from allennlp.common import Params -from allennlp.common.checks import parse_cuda_device, check_for_gpu, ConfigurationError +from allennlp.common.checks import parse_cuda_device, check_for_gpu from allennlp.common.tqdm import Tqdm from allennlp.data import Instance from allennlp.data.iterators.data_iterator import DataIterator, TensorDict @@ -315,10 +315,6 @@ def from_params( # type: ignore cuda_device = parse_cuda_device(params.pop("cuda_device", -1)) check_for_gpu(cuda_device) - if isinstance(cuda_device, list): - raise ConfigurationError( - "In allennlp 1.0, the Trainer cannot be passed multiple cuda devices." - ) if cuda_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. 
diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 021573db0de..40eaac7d604 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -764,10 +764,6 @@ def from_params( # type: ignore momentum_scheduler_params = params.pop("momentum_scheduler", None) check_for_gpu(cuda_device) - if isinstance(cuda_device, list): - raise ConfigurationError( - "In allennlp 1.0, the Trainer cannot be passed multiple cuda devices." - ) if cuda_device >= 0: # Moving model to GPU here so that the optimizer state gets constructed on # the right device. diff --git a/allennlp/training/trainer_base.py b/allennlp/training/trainer_base.py index 65e336fb18f..d3530c936ab 100644 --- a/allennlp/training/trainer_base.py +++ b/allennlp/training/trainer_base.py @@ -9,7 +9,7 @@ import logging -from typing import Dict, List, Union, Any +from typing import Dict, Any from allennlp.common import Params, Registrable from allennlp.common.util import is_master @@ -48,9 +48,7 @@ def __init__( ) if not isinstance(cuda_device, int): - raise ConfigurationError( - "Expected an int for cuda_device, got {}".format(cuda_device) - ) + raise ConfigurationError("Expected an int for cuda_device, got {}".format(cuda_device)) if distributed and world_size <= 1: raise ConfigurationError(
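A minimal sketch of the behaviour this last patch is aiming for, not part of the series itself. It assumes the dispatch branches of parse_cuda_device that the checks.py hunk leaves untouched (the int / str / list handling) still behave as in the existing allennlp/common/checks.py, with only from_list changed to raise on multiple devices:

    from allennlp.common.checks import ConfigurationError, parse_cuda_device

    # A single device still parses to a plain int.
    assert parse_cuda_device(0) == 0
    assert parse_cuda_device("1") == 1
    assert parse_cuda_device(-1) == -1

    # A list of devices is no longer a valid Trainer setting: it raises a
    # ConfigurationError whose message points at the new top-level
    # "distributed": {"cuda_devices": [...]} config block instead.
    try:
        parse_cuda_device([0, 1, 2, 3])
    except ConfigurationError as err:
        print("multi-GPU cuda_device rejected:", err)

The long message embedded in the checks.py hunk shows the corresponding config migration: the per-trainer cuda_device list moves into the top-level "distributed" block consumed by train_model.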