diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3e13cf674f7..e733af05bc9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -151,7 +151,8 @@ jobs: ALLENNLP_VERSION_OVERRIDE: "" # Don't replace the core library. run: | git clone https://github.com/allenai/allennlp-models.git - cd allennlp-models && pip install --upgrade --upgrade-strategy eager -e . -r dev-requirements.txt + cd allennlp-models
+ pip install --upgrade --upgrade-strategy eager -e . -r dev-requirements.txt
- name: Run models tests run: | @@ -288,11 +289,11 @@ jobs: run: | # Check the install instructions on https://pytorch.org/ to keep these up-to-date. if [[ $CUDA == '10.1' ]]; then - echo "DOCKER_TORCH_VERSION='torch==1.7.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV; + echo "DOCKER_TORCH_VERSION='torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV; elif [[ $CUDA == '10.2' ]]; then echo "DOCKER_TORCH_VERSION='torch==1.7.1'" >> $GITHUB_ENV; elif [[ $CUDA == '11.0' ]]; then - echo "DOCKER_TORCH_VERSION='torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV; + echo "DOCKER_TORCH_VERSION='torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html'" >> $GITHUB_ENV; else echo "Unhandled CUDA version $CUDA"; exit 1; @@ -389,15 +390,19 @@ jobs: run: | ./scripts/build_docs.sh + - name: Print the ref + run: | + echo ${{ github.ref }} + - name: Configure Git - if: github.event_name == 'release' || github.event_name == 'push' + if: github.event_name == 'release' || (github.event_name == 'push' && github.ref == 'refs/heads/main') run: | git config --global user.email "ai2service@allenai.org" git config --global user.name "ai2service" git config --global push.default simple - name: Stage docs - if: github.event_name == 'release' || github.event_name == 'push' + if: github.event_name == 'release' || (github.event_name == 'push' && github.ref == 'refs/heads/main') run: | echo "Staging docs to $DOCS_FOLDER" @@ -449,7 +454,7 @@ jobs: EOL - name: Deploy docs - if: github.event_name == 'release' || github.event_name == 'push' + if: github.event_name == 'release' || (github.event_name == 'push' && github.ref == 'refs/heads/main') run: | # And push them up to GitHub cd ~/allennlp-docs/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 5197962fcc2..4dd9a95b473 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,49 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added
+- The `TrainerCallback` constructor accepts `serialization_dir` provided by `Trainer`. This can be useful for `Logger` callbacks that need to store files in the run directory.
+- `TrainerCallback.on_start()` is fired at the start of training.
+- The `TrainerCallback` event methods now accept `**kwargs`. This makes it easier to maintain backwards compatibility of callbacks in the future. E.g. we may decide to pass the exception/traceback object to `on_end()` in case of failure, and older callbacks can simply ignore the argument instead of raising a `TypeError`.
+
+### Changed
+
+- `TrainerCallback.on_epoch()` no longer fires with `epoch=-1` at the start of training.
+  Instead, `TrainerCallback.on_start()` should be used for these cases.
+- `TensorBoardBatchMemoryUsage` is converted from `BatchCallback` into `TrainerCallback`.
+- `TrackEpochCallback` is converted from `EpochCallback` into `TrainerCallback`.
+- `Trainer` now accepts callbacks via the `callbacks` argument instead of `trainer_callbacks` (see the sketch below).
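To make the callback changes above concrete, here is a minimal, hypothetical sketch of a `TrainerCallback` that writes a file into the run directory. The registered name `"loss-history"`, the output file name, the `training_loss` metric key, and the exact keyword arguments are illustrative assumptions; check the `TrainerCallback` source for the precise signatures.

```python
import json
import os

from allennlp.training.trainer import GradientDescentTrainer, TrainerCallback


@TrainerCallback.register("loss-history")  # hypothetical registered name
class LossHistoryCallback(TrainerCallback):
    def __init__(self, serialization_dir: str) -> None:
        # `serialization_dir` is now provided by the `Trainer`, so callbacks can
        # store artifacts alongside the rest of the run.
        super().__init__(serialization_dir)
        self.losses = []

    def on_start(self, trainer: GradientDescentTrainer, is_primary: bool = True, **kwargs) -> None:
        # Fired once before the first epoch (replaces the old `epoch=-1` call).
        self.losses = []

    def on_epoch(self, trainer: GradientDescentTrainer, metrics, epoch, is_primary: bool = True, **kwargs) -> None:
        self.losses.append(metrics.get("training_loss"))

    def on_end(self, trainer: GradientDescentTrainer, metrics=None, epoch=None, is_primary: bool = True, **kwargs) -> None:
        # Always fired, even if training fails, so cleanup and flushing belong here.
        if is_primary:
            with open(os.path.join(self.serialization_dir, "loss_history.json"), "w") as f:
                json.dump(self.losses, f)
```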
+
+### Removed
+
+- Removed `EpochCallback`, `BatchCallback` in favour of `TrainerCallback`.
+  The metaclass-wrapping implementation is removed as well.
+
+### Fixed
+
+- The `Trainer` now always fires `TrainerCallback.on_end()` so that all resources can be cleaned up properly.
+- Fixed a misspelling: `TensoboardBatchMemoryUsage` was renamed to `TensorBoardBatchMemoryUsage`.
+- `epoch` is now always assigned a value before `TrainerCallback.on_end()` fires, so the variable is always bound.
+  Previously this could have led to an error when trying to recover a run after it had finished training.
+
+
+## [v2.0.0rc1](https://github.com/allenai/allennlp/releases/tag/v2.0.0rc1) - 2021-01-21
+
+### Added
+
+- Added a `TensorCache` class for caching tensors on disk.
+- Added abstraction and concrete implementation for image loading.
+- Added abstraction and concrete implementation for `GridEmbedder`.
+- Added abstraction and demo implementation for an image augmentation module.
+- Added abstraction and concrete implementation for region detectors.
+- A new high-performance default `DataLoader`: `MultiProcessDataLoading`.
+- A `MultiTaskModel` and abstractions to use with it, including `Backbone` and `Head`. The
+  `MultiTaskModel` first runs its inputs through the `Backbone`, then passes the result (and
+  whatever other relevant inputs it got) to each `Head` that's in use.
+- A `MultiTaskDataLoader`, with a corresponding `MultiTaskDatasetReader`, and a couple of new
+  configuration objects: `MultiTaskEpochSampler` (for deciding what proportion to sample from each
+  dataset at every epoch) and a `MultiTaskScheduler` (for ordering the instances within an epoch).
+- Transformer toolkit to plug and play with modular components of transformer architectures.
+- Added a command to count the number of instances we're going to be training with.
- Added a `FileLock` class to `common.file_utils`. This is just like the `FileLock` from the `filelock` library, except that it adds an optional flag `read_only_ok: bool`, which when set to `True` changes the behavior so that a warning will be emitted instead of an exception when lacking write permissions on an existing file lock. @@ -19,14 +62,42 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Moving `ModelCard` and `TaskCard` abstractions into the main repository. - Added a util function `allennlp.nn.util.dist_reduce(...)` for handling distributed reductions. This is especially useful when implementing a distributed `Metric`.
+- Added a `FileLock` class to `common.file_utils`. This is just like the `FileLock` from the `filelock` library, except that
+  it adds an optional flag `read_only_ok: bool`, which when set to `True` changes the behavior so that a warning will be emitted
+  instead of an exception when lacking write permissions on an existing file lock.
+  This makes it possible to use the `FileLock` class on a read-only file system.
+- Added a new learning rate scheduler: `CombinedLearningRateScheduler`. This can be used to combine different LR schedulers, using one after the other.
+- Moving `ModelCard` and `TaskCard` abstractions into the main repository. ### Changed
+- `DatasetReader`s are now always lazy. This means there is no `lazy` parameter in the base
+  class, and the `_read()` method should always be a generator.
+- The `DataLoader` now decides whether to load instances lazily or not.
+ With the `PyTorchDataLoader` this is controlled with the `lazy` parameter, but with + the `MultiProcessDataLoading` this is controlled by the `max_instances_in_memory` setting. +- `ArrayField` is now called `TensorField`, and implemented in terms of torch tensors, not numpy. +- Improved `nn.util.move_to_device` function by avoiding an unnecessary recursive check for tensors and + adding a `non_blocking` optional argument, which is the same argument as in `torch.Tensor.to()`. +- If you are trying to create a heterogeneous batch, you now get a better error message. +- Readers using the new vision features now explicitly log how they are featurizing images. +- `master_addr` and `master_port` renamed to `primary_addr` and `primary_port`, respectively. +- `is_master` parameter for training callbacks renamed to `is_primary`. +- `master` branch renamed to `main` +- Torch version bumped to 1.7.1 in Docker images. - 'master' branch renamed to 'main' - Torch version bumped to 1.7.1 in Docker images. +### Removed + +- Removed `nn.util.has_tensor`. + ### Fixed +- The `build-vocab` command no longer crashes when the resulting vocab file is + in the current working directory. +- VQA models now use the `vqa_score` metric for early stopping. This results in + much better scores. - Fixed typo with `LabelField` string representation: removed trailing apostrophe. - `Vocabulary.from_files` and `cached_path` will issue a warning, instead of failing, when a lock on an existing resource can't be acquired because the file system is read-only. @@ -58,11 +129,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 were not passed to the constructor if the value of the parameter was equal to the default value. This caused bugs in some edge cases where a subclass that takes `**kwargs` needs to inspect `kwargs` before passing them to its superclass. -- Improved the band-aid solution for segmentation faults and the "ImportError: dlopen: cannot load any more object with static TLS" +- Improved the band-aid solution for segmentation faults and the "ImportError: dlopen: cannot load any more object with static TLS" by adding a `transformers` import. - Added safety checks for extracting tar files - Turned superfluous warning to info when extending the vocab in the embedding matrix, if no pretrained file was provided + ## [v1.2.2](https://github.com/allenai/allennlp/releases/tag/v1.2.2) - 2020-11-17 ### Added @@ -213,6 +285,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed a bug in the cnn_encoder where activations involving masked tokens could be picked up by the max - Fix intra word tokenization for `PretrainedTransformerTokenizer` when disabling fast tokenizer. + ## [v1.1.0](https://github.com/allenai/allennlp/releases/tag/v1.1.0) - 2020-09-08 ### Fixed @@ -227,8 +300,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- `Predictor.capture_model_internals()` now accepts a regex specifying - which modules to capture +- `Predictor.capture_model_internals()` now accepts a regex specifying which modules to capture. ## [v1.1.0rc4](https://github.com/allenai/allennlp/releases/tag/v1.1.0rc4) - 2020-08-20 @@ -295,7 +367,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 in the log output even when `train_parameters` was set to `False`. - Fixed a bug with the sharded dataset reader where it would only read a fraction of the instances in distributed training. 
-- Fixed checking equality of `ArrayField`s. +- Fixed checking equality of `TensorField`s. - Fixed a bug where `NamespaceSwappingField` did not work correctly with `.empty_field()`. - Put more sensible defaults on the `huggingface_adamw` optimizer. - Simplified logging so that all logging output always goes to one file. diff --git a/Dockerfile b/Dockerfile index b62eb4f0e13..886c52609f7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ LABEL com.nvidia.volumes.needed="nvidia_driver" WORKDIR /stage/allennlp -# Install torch first. This build arg should be in the form of a version requirement, +# Install torch ecosystem first. This build arg should be in the form of a version requirement, # like 'torch==1.7' or 'torch==1.7+cu102 -f https://download.pytorch.org/whl/torch_stable.html'. ARG TORCH RUN pip install --no-cache-dir ${TORCH} diff --git a/Dockerfile.test b/Dockerfile.test index beb9128fcd6..e91488c67db 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -17,7 +17,7 @@ LABEL com.nvidia.volumes.needed="nvidia_driver" WORKDIR /stage/allennlp -# Install torch first. This build arg should be in the form of a version requirement, +# Install torch ecosystem first. This build arg should be in the form of a version requirement, # like 'torch==1.7' or 'torch==1.7+cu102 -f https://download.pytorch.org/whl/torch_stable.html'. ARG TORCH RUN pip install --no-cache-dir ${TORCH} @@ -30,7 +30,7 @@ COPY setup.py . COPY dev-requirements.txt . RUN touch allennlp/__init__.py \ && touch README.md \ - && pip install --no-cache-dir -r dev-requirements.txt -e . + && pip install --no-cache-dir -e . -r dev-requirements.txt # Now add the full package source and re-install just the package. COPY . . diff --git a/Makefile b/Makefile index b1ced1acd69..1ed1a6b1098 100644 --- a/Makefile +++ b/Makefile @@ -13,9 +13,9 @@ MD_DOCS_EXTRAS = $(addprefix $(MD_DOCS_ROOT),README.md CHANGELOG.md CONTRIBUTING DOCKER_TAG = latest DOCKER_IMAGE_NAME = allennlp/allennlp:$(DOCKER_TAG) DOCKER_TEST_IMAGE_NAME = allennlp/test:$(DOCKER_TAG) -DOCKER_TORCH_VERSION = 'torch==1.7.1' +DOCKER_TORCH_VERSION = 'torch==1.7.1 torchvision==0.8.2' # Our self-hosted runner currently has CUDA 11.0. -DOCKER_TEST_TORCH_VERSION = 'torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html' +DOCKER_TEST_TORCH_VERSION = 'torch==1.7.1+cu110 torchvision==0.8.2+cu110 -f https://download.pytorch.org/whl/torch_stable.html' DOCKER_RUN_CMD = docker run --rm \ -v $$HOME/.allennlp:/root/.allennlp \ -v $$HOME/.cache/huggingface:/root/.cache/huggingface \ @@ -85,7 +85,6 @@ install : # Due to a weird thing with pip, we may need egg-info before running `pip install -e`. # See https://github.com/pypa/pip/issues/4537. python setup.py install_egg_info - # Install allennlp as editable and all dependencies. pip install --upgrade --upgrade-strategy eager -e . -r dev-requirements.txt # diff --git a/README.md b/README.md index 48adf672abe..a29b3d4e753 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,10 @@ to distribute as a plugin, see the [subcommand API docs](https://docs.allennlp.o ## Installation -AllenNLP requires Python 3.6.1 or later. The preferred way to install AllenNLP is via `pip`. Just run `pip install allennlp` in your Python environment and you're good to go! +AllenNLP requires Python 3.6.1 or later and [PyTorch](https://pytorch.org/). +It's recommended that you install the PyTorch ecosystem **before** installing AllenNLP by following the instructions on [pytorch.org](https://pytorch.org/). 
+ +The preferred way to install AllenNLP is via `pip`. Just run `pip install allennlp`. > ⚠️ If you're using Python 3.7 or greater, you should ensure that you don't have the PyPI version of `dataclasses` installed after running the above command, as this could cause issues on certain platforms. You can quickly check this by running `pip freeze | grep dataclasses`. If you see something like `dataclasses=0.6` in the output, then just run `pip uninstall -y dataclasses`. diff --git a/allennlp/commands/__init__.py b/allennlp/commands/__init__.py index a0175d2ca66..3a0fba2232f 100644 --- a/allennlp/commands/__init__.py +++ b/allennlp/commands/__init__.py @@ -15,6 +15,7 @@ from allennlp.commands.subcommand import Subcommand from allennlp.commands.test_install import TestInstall from allennlp.commands.train import Train +from allennlp.commands.count_instances import CountInstances from allennlp.common.plugins import import_plugins from allennlp.common.util import import_module_and_submodules diff --git a/allennlp/commands/build_vocab.py b/allennlp/commands/build_vocab.py index bcab3b466bd..242b5266947 100644 --- a/allennlp/commands/build_vocab.py +++ b/allennlp/commands/build_vocab.py @@ -65,7 +65,8 @@ def build_vocab_from_args(args: argparse.Namespace): raise RuntimeError(f"{args.output_path} already exists. Use --force to overwrite.") output_directory = os.path.dirname(args.output_path) - os.makedirs(output_directory, exist_ok=True) + if len(output_directory) > 0: + os.makedirs(output_directory, exist_ok=True) params = Params.from_file(args.param_path) diff --git a/allennlp/commands/count_instances.py b/allennlp/commands/count_instances.py new file mode 100644 index 00000000000..8a9fbd10e9b --- /dev/null +++ b/allennlp/commands/count_instances.py @@ -0,0 +1,52 @@ +""" +Subcommand for counting the number of instances from a training config. +""" + +import argparse +import logging + +from overrides import overrides + +from allennlp.commands.subcommand import Subcommand +from allennlp.common.params import Params + + +logger = logging.getLogger(__name__) + + +@Subcommand.register("count-instances") +class CountInstances(Subcommand): + @overrides + def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + description = """Count the number of training instances in an experiment config file.""" + subparser = parser.add_parser(self.name, description=description, help=description) + subparser.add_argument("param_path", type=str, help="path to an experiment config file") + + subparser.add_argument( + "-o", + "--overrides", + type=str, + default="", + help=( + "a json(net) structure used to override the experiment configuration, e.g., " + "'{\"vocabulary.min_count.labels\": 10}'. Nested parameters can be specified either" + " with nested dictionaries or with dot syntax." + ), + ) + + subparser.set_defaults(func=count_instances_from_args) + + return subparser + + +def count_instances_from_args(args: argparse.Namespace): + from allennlp.training.util import data_loaders_from_params + + params = Params.from_file(args.param_path) + + data_loaders = data_loaders_from_params(params, train=True, validation=False, test=False) + instances = sum( + 1 for data_loader in data_loaders.values() for _ in data_loader.iter_instances() + ) + + print(f"Success! 
One epoch of training contains {instances} instances.") diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index fced91c2c1b..d0b0692f857 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -134,7 +134,15 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: evaluation_data_path = args.input_file logger.info("Reading evaluation data from %s", evaluation_data_path) - instances = dataset_reader.read(evaluation_data_path) + + data_loader_params = config.pop("validation_data_loader", None) + if data_loader_params is None: + data_loader_params = config.pop("data_loader") + if args.batch_size: + data_loader_params["batch_size"] = args.batch_size + data_loader = DataLoader.from_params( + params=data_loader_params, reader=dataset_reader, data_path=evaluation_data_path + ) embedding_sources = ( json.loads(args.embedding_sources_mapping) if args.embedding_sources_mapping else {} @@ -142,16 +150,10 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: if args.extend_vocab: logger.info("Vocabulary is being extended with test instances.") - model.vocab.extend_from_instances(instances=instances) + model.vocab.extend_from_instances(instances=data_loader.iter_instances()) model.extend_embedder_vocab(embedding_sources) - instances.index_with(model.vocab) - data_loader_params = config.pop("validation_data_loader", None) - if data_loader_params is None: - data_loader_params = config.pop("data_loader") - if args.batch_size: - data_loader_params["batch_size"] = args.batch_size - data_loader = DataLoader.from_params(dataset=instances, params=data_loader_params) + data_loader.index_with(model.vocab) metrics = evaluate( model, diff --git a/allennlp/commands/find_learning_rate.py b/allennlp/commands/find_learning_rate.py index e0fa3a301f4..8a1f6380ed4 100644 --- a/allennlp/commands/find_learning_rate.py +++ b/allennlp/commands/find_learning_rate.py @@ -20,10 +20,9 @@ from allennlp.common.checks import check_for_gpu, ConfigurationError from allennlp.common.util import prepare_environment from allennlp.data import Vocabulary -from allennlp.data import DataLoader from allennlp.models import Model from allennlp.training import GradientDescentTrainer, Trainer -from allennlp.training.util import create_serialization_dir, datasets_from_params +from allennlp.training.util import create_serialization_dir, data_loaders_from_params logger = logging.getLogger(__name__) @@ -165,11 +164,11 @@ def find_learning_rate_model( # See https://github.com/allenai/allennlp/issues/3658 assert not distributed_params, "find-lr is not compatible with DistributedDataParallel." 
- all_datasets = datasets_from_params(params, serialization_dir=serialization_dir) - datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) + all_data_loaders = data_loaders_from_params(params) + datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_data_loaders)) for dataset in datasets_for_vocab_creation: - if dataset not in all_datasets: + if dataset not in all_data_loaders: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info( @@ -180,18 +179,17 @@ def find_learning_rate_model( params.pop("vocabulary", {}), instances=( instance - for key, dataset in all_datasets.items() - for instance in dataset + for key, data_loader in all_data_loaders.items() if key in datasets_for_vocab_creation + for instance in data_loader.iter_instances() ), ) - train_data = all_datasets["train"] - train_data.index_with(vocab) model = Model.from_params( vocab=vocab, params=params.pop("model"), serialization_dir=serialization_dir ) - data_loader = DataLoader.from_params(dataset=train_data, params=params.pop("data_loader")) + + all_data_loaders["train"].index_with(vocab) trainer_params = params.pop("trainer") @@ -208,7 +206,7 @@ def find_learning_rate_model( trainer: GradientDescentTrainer = Trainer.from_params( # type: ignore model=model, serialization_dir=serialization_dir, - data_loader=data_loader, + data_loader=all_data_loaders["train"], params=trainer_params, ) diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 86767559742..33d5df63acb 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -266,15 +266,15 @@ def train_model( ) check_for_gpu(device_ids) - master_addr = distributed_params.pop("master_address", "127.0.0.1") - if master_addr in ("127.0.0.1", "0.0.0.0", "localhost"): + primary_addr = distributed_params.pop("primary_address", "127.0.0.1") + if primary_addr in ("127.0.0.1", "0.0.0.0", "localhost"): # If running locally, we can automatically find an open port if one is not specified. - master_port = ( - distributed_params.pop("master_port", None) or common_util.find_open_port() + primary_port = ( + distributed_params.pop("primary_port", None) or common_util.find_open_port() ) else: # Otherwise we require that the port be specified. - master_port = distributed_params.pop("master_port") + primary_port = distributed_params.pop("primary_port") num_procs = len(device_ids) world_size = num_nodes * num_procs @@ -300,7 +300,7 @@ def train_model( logging.info( "Switching to distributed training mode since multiple GPUs are configured | " - f"Master is at: {master_addr}:{master_port} | Rank of this node: {node_rank} | " + f"Primary is at: {primary_addr}:{primary_port} | Rank of this node: {node_rank} | " f"Number of workers in this node: {num_procs} | Number of nodes: {num_nodes} | " f"World size: {world_size}" ) @@ -313,8 +313,8 @@ def train_model( include_package, dry_run, node_rank, - master_addr, - master_port, + primary_addr, + primary_port, world_size, device_ids, file_friendly_logging, @@ -337,8 +337,8 @@ def _train_worker( include_package: List[str] = None, dry_run: bool = False, node_rank: int = 0, - master_addr: str = "127.0.0.1", - master_port: int = 29500, + primary_addr: str = "127.0.0.1", + primary_port: int = 29500, world_size: int = 1, distributed_device_ids: List[int] = None, file_friendly_logging: bool = False, @@ -366,10 +366,10 @@ def _train_worker( information. node_rank : `int`, optional Rank of the node. 
- master_addr : `str`, optional (default=`"127.0.0.1"`) - Address of the master node for distributed training. - master_port : `str`, optional (default=`"29500"`) - Port of the master node for distributed training. + primary_addr : `str`, optional (default=`"127.0.0.1"`) + Address of the primary node for distributed training. + primary_port : `str`, optional (default=`"29500"`) + Port of the primary node for distributed training. world_size : `int`, optional The number of processes involved in distributed training. distributed_device_ids: `List[str]`, optional @@ -396,7 +396,7 @@ def _train_worker( distributed = world_size > 1 - master = process_rank == 0 + primary = process_rank == 0 include_package = include_package or [] @@ -416,7 +416,7 @@ def _train_worker( global_rank = node_rank * num_procs_per_node + process_rank # Number of processes per node is useful to know if a process - # is a master in the local node(node in which it is running) + # is a primary in the local node(node in which it is running) os.environ["ALLENNLP_PROCS_PER_NODE"] = str(num_procs_per_node) # In distributed training, the configured device is always going to be a list. @@ -434,14 +434,14 @@ def _train_worker( torch.cuda.set_device(int(gpu_id)) dist.init_process_group( backend="nccl", - init_method=f"tcp://{master_addr}:{master_port}", + init_method=f"tcp://{primary_addr}:{primary_port}", world_size=world_size, rank=global_rank, ) else: dist.init_process_group( backend="gloo", - init_method=f"tcp://{master_addr}:{master_port}", + init_method=f"tcp://{primary_addr}:{primary_port}", world_size=world_size, rank=global_rank, ) @@ -466,7 +466,7 @@ def _train_worker( metrics = train_loop.run() except KeyboardInterrupt: # if we have completed an epoch, try to create a model archive. - if master and os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): + if primary and os.path.exists(os.path.join(serialization_dir, _DEFAULT_WEIGHTS)): logging.info( "Training interrupted by the user. Attempting to create " "a model archive using the current best epoch weights." @@ -474,7 +474,7 @@ def _train_worker( archive_model(serialization_dir, include_in_archive=include_in_archive) raise - if master: + if primary: train_loop.finish(metrics) if not distributed: @@ -554,16 +554,16 @@ def from_partial_objects( serialization_dir: str, local_rank: int, dataset_reader: DatasetReader, - train_data_path: str, + train_data_path: Any, model: Lazy[Model], data_loader: Lazy[DataLoader], trainer: Lazy[Trainer], vocabulary: Lazy[Vocabulary] = Lazy(Vocabulary), datasets_for_vocab_creation: List[str] = None, validation_dataset_reader: DatasetReader = None, - validation_data_path: str = None, + validation_data_path: Any = None, validation_data_loader: Lazy[DataLoader] = None, - test_data_path: str = None, + test_data_path: Any = None, evaluate_on_test: bool = False, batch_weight_key: str = "", ) -> "TrainModel": @@ -602,57 +602,97 @@ def from_partial_objects( dataset_reader: `DatasetReader` The `DatasetReader` that will be used for training and (by default) for validation. + train_data_path: `str` The file (or directory) that will be passed to `dataset_reader.read()` to construct the training data. + model: `Lazy[Model]` The model that we will train. This is lazy because it depends on the `Vocabulary`; after constructing the vocabulary we call `model.construct(vocab=vocabulary)`. + data_loader: `Lazy[DataLoader]` The data_loader we use to batch instances from the dataset reader at training and (by default) validation time. 
This is lazy because it takes a dataset in it's constructor. + trainer: `Lazy[Trainer]` The `Trainer` that actually implements the training loop. This is a lazy object because it depends on the model that's going to be trained. + vocabulary: `Lazy[Vocabulary]`, optional (default=`Lazy(Vocabulary)`) The `Vocabulary` that we will use to convert strings in the data to integer ids (and possibly set sizes of embedding matrices in the `Model`). By default we construct the vocabulary from the instances that we read. + datasets_for_vocab_creation: `List[str]`, optional (default=`None`) If you pass in more than one dataset but don't want to use all of them to construct a vocabulary, you can pass in this key to limit it. Valid entries in the list are "train", "validation" and "test". + validation_dataset_reader: `DatasetReader`, optional (default=`None`) If given, we will use this dataset reader for the validation data instead of `dataset_reader`. + validation_data_path: `str`, optional (default=`None`) If given, we will use this data for computing validation metrics and early stopping. + validation_data_loader: `Lazy[DataLoader]`, optional (default=`None`) If given, the data_loader we use to batch instances from the dataset reader at validation and test time. This is lazy because it takes a dataset in it's constructor. + test_data_path: `str`, optional (default=`None`) If given, we will use this as test data. This makes it available for vocab creation by default, but nothing else. + evaluate_on_test: `bool`, optional (default=`False`) If given, we will evaluate the final model on this data at the end of training. Note that we do not recommend using this for actual test data in every-day experimentation; you should only very rarely evaluate your model on actual test data. + batch_weight_key: `str`, optional (default=`""`) The name of metric used to weight the loss on a per-batch basis. This is only used during evaluation on final test data, if you've specified `evaluate_on_test=True`. """ + # Train data loader. + data_loaders: Dict[str, DataLoader] = { + "train": data_loader.construct(reader=dataset_reader, data_path=train_data_path) + } - datasets = training_util.read_all_datasets( - train_data_path=train_data_path, - dataset_reader=dataset_reader, - validation_dataset_reader=validation_dataset_reader, - validation_data_path=validation_data_path, - test_data_path=test_data_path, - ) + # Validation data loader. + if validation_data_path is not None: + validation_dataset_reader = validation_dataset_reader or dataset_reader + if validation_data_loader is not None: + data_loaders["validation"] = validation_data_loader.construct( + reader=validation_dataset_reader, data_path=validation_data_path + ) + else: + data_loaders["validation"] = data_loader.construct( + reader=validation_dataset_reader, data_path=validation_data_path + ) + if getattr(data_loaders["validation"], "batches_per_epoch", None) is not None: + warnings.warn( + "Using 'data_loader' params to construct validation data loader since " + "'validation_data_loader' params not specified, but you have " + "'data_loader.batches_per_epoch' set which may result in different " + "validation datasets for each epoch.", + UserWarning, + ) + + # Test data loader. 
+ if test_data_path is not None: + test_dataset_reader = validation_dataset_reader or dataset_reader + if validation_data_loader is not None: + data_loaders["test"] = validation_data_loader.construct( + reader=test_dataset_reader, data_path=test_data_path + ) + else: + data_loaders["test"] = data_loader.construct( + reader=test_dataset_reader, data_path=test_data_path + ) if datasets_for_vocab_creation: for key in datasets_for_vocab_creation: - if key not in datasets: + if key not in data_loaders: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {key}") logger.info( @@ -662,9 +702,9 @@ def from_partial_objects( instance_generator = ( instance - for key, dataset in datasets.items() + for key, data_loader in data_loaders.items() if datasets_for_vocab_creation is None or key in datasets_for_vocab_creation - for instance in dataset + for instance in data_loader.iter_instances() ) vocabulary_ = vocabulary.construct(instances=instance_generator) @@ -672,49 +712,24 @@ def from_partial_objects( model_ = model.construct(vocab=vocabulary_, serialization_dir=serialization_dir) # Initializing the model can have side effect of expanding the vocabulary. - # Save the vocab only in the master. In the degenerate non-distributed - # case, we're trivially the master. In the distributed case this is safe + # Save the vocab only in the primary. In the degenerate non-distributed + # case, we're trivially the primary. In the distributed case this is safe # to do without worrying about race conditions since saving and loading # the vocab involves acquiring a file lock. if local_rank == 0: vocabulary_path = os.path.join(serialization_dir, "vocabulary") vocabulary_.save_to_files(vocabulary_path) - for dataset in datasets.values(): - dataset.index_with(model_.vocab) - - data_loader_ = data_loader.construct(dataset=datasets["train"]) - validation_data = datasets.get("validation") - validation_data_loader_: Optional[DataLoader] = None - if validation_data is not None: - if validation_data_loader is None: - validation_data_loader_ = data_loader.construct(dataset=validation_data) - if getattr(validation_data_loader_, "_batches_per_epoch", None) is not None: - warnings.warn( - "Using 'data_loader' params to construct validation data loader since " - "'validation_data_loader' params not specified, but you have " - "'data_loader.batches_per_epoch' set which may result in different " - "validation datasets for each epoch.", - UserWarning, - ) - else: - validation_data_loader_ = validation_data_loader.construct(dataset=validation_data) - - test_data = datasets.get("test") - test_data_loader: Optional[DataLoader] = None - if test_data is not None: - if validation_data_loader is None: - test_data_loader = data_loader.construct(dataset=test_data) - else: - test_data_loader = validation_data_loader.construct(dataset=test_data) + for data_loader_ in data_loaders.values(): + data_loader_.index_with(model_.vocab) # We don't need to pass serialization_dir and local_rank here, because they will have been # passed through the trainer by from_params already, because they were keyword arguments to # construct this class in the first place. 
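For orientation, this is roughly how the pieces handled above map onto a training config, sketched here as a Python dict. The reader and model types, the paths, and the numbers are placeholders, not part of this change.

```python
# Rough shape of the relevant config fields handled by `from_partial_objects` above.
# Each data path gets its own data loader, built from "data_loader" or, if present,
# "validation_data_loader" (which is used for both validation and test).
config = {
    "dataset_reader": {"type": "my_reader"},      # placeholder reader name
    "train_data_path": "data/train.jsonl",        # placeholder paths
    "validation_data_path": "data/dev.jsonl",
    "test_data_path": "data/test.jsonl",
    "data_loader": {
        "batch_size": 32,
        "shuffle": True,
        "batches_per_epoch": 500,                  # only meaningful for training
    },
    # Without this block the training "data_loader" settings are reused for
    # validation and test, and `batches_per_epoch` then triggers the warning above.
    "validation_data_loader": {"batch_size": 64, "shuffle": False},
    "model": {"type": "my_model"},                 # placeholder model name
    "trainer": {"num_epochs": 10, "optimizer": {"type": "adam"}},
}
```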
trainer_ = trainer.construct( model=model_, - data_loader=data_loader_, - validation_data_loader=validation_data_loader_, + data_loader=data_loaders["train"], + validation_data_loader=data_loaders.get("validation"), ) assert trainer_ is not None @@ -722,7 +737,7 @@ def from_partial_objects( serialization_dir=serialization_dir, model=model_, trainer=trainer_, - evaluation_data_loader=test_data_loader, + evaluation_data_loader=data_loaders.get("test"), evaluate_on_test=evaluate_on_test, batch_weight_key=batch_weight_key, ) diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py index ae3e2182a36..ecb252fc5e2 100644 --- a/allennlp/common/file_utils.py +++ b/allennlp/common/file_utils.py @@ -2,11 +2,14 @@ Utilities for working with the local dataset cache. """ +from contextlib import contextmanager import glob +import io import os import logging import tempfile import json +from abc import ABC from collections import defaultdict from dataclasses import dataclass, asdict from datetime import timedelta @@ -26,24 +29,31 @@ Iterable, Dict, NamedTuple, + MutableMapping, ) from hashlib import sha256 from functools import wraps +from weakref import WeakValueDictionary from zipfile import ZipFile, is_zipfile import tarfile import shutil +import pickle import time import warnings import boto3 import botocore +import torch from botocore.exceptions import ClientError, EndpointConnectionError from filelock import FileLock as _FileLock +import numpy as np from overrides import overrides import requests from requests.adapters import HTTPAdapter from requests.exceptions import ConnectionError from requests.packages.urllib3.util.retry import Retry +import lmdb +from torch import Tensor from allennlp.common.tqdm import Tqdm @@ -271,7 +281,11 @@ def cached_path( # Normalize the path. url_or_filename = os.path.abspath(url_or_filename) - if extract_archive and (is_zipfile(file_path) or tarfile.is_tarfile(file_path)): + if ( + extract_archive + and os.path.isfile(file_path) + and (is_zipfile(file_path) or tarfile.is_tarfile(file_path)) + ): # We'll use a unique directory within the cache to root to extract the archive to. # The name of the directoy is a hash of the resource file path and it's modification # time. That way, if the file changes, we'll know when to extract it again. @@ -460,6 +474,171 @@ def _find_latest_cached(url: str, cache_dir: Union[str, Path]) -> Optional[str]: return None +def _serialize(data): + buffer = pickle.dumps(data, protocol=-1) + return np.frombuffer(buffer, dtype=np.uint8) + + +class TensorCache(MutableMapping[str, Tensor], ABC): + """ + This is a key-value store, mapping strings to tensors. The data is kept on disk, + making this class useful as a cache for storing tensors. + + `TensorCache` is also safe to access from multiple processes at the same time, so + you can use it in distributed training situations, or from multiple training + runs at the same time. + """ + + def __init__( + self, + filename: Union[str, PathLike], + *, + map_size: int = 1024 * 1024 * 1024 * 1024, + read_only: bool = False, + ) -> None: + """ + Creates a `TensorCache` by either opening an existing one on disk, or creating + a new one. Its interface is almost exactly like a Python dictionary, where the + keys are strings and the values are `torch.Tensor`. + + Parameters + ---------- + filename: `str` + Path to the location of the cache + map_size: `int`, optional, defaults to 1TB + This is the maximum size the cache will ever grow to. 
On reasonable operating + systems, there is no penalty to making this a large value. + `TensorCache` uses a memory-mapped file to store the data. When the file is + first opened, we have to give the maximum size it can ever grow to. This is + that number. Reasonable operating systems don't actually allocate that space + until it is really needed. + """ + filename = str(filename) + + cpu_count = os.cpu_count() or 1 + if os.path.exists(filename): + if os.path.isfile(filename): + # If the file is not writable, set read_only to True, but issue a warning. + if not os.access(filename, os.W_OK): + if not read_only: + warnings.warn( + f"File '{filename}' is read-only, so cache will be read-only", + UserWarning, + ) + read_only = True + else: + # If it's not a file, raise an error. + raise ValueError("Expect a file, found a directory instead") + + use_lock = True + if read_only: + # Check if the lock file is writable. If it's not, then we won't be able to use the lock. + + # This is always how lmdb names the lock file. + lock_filename = filename + "-lock" + if os.path.isfile(lock_filename): + use_lock = os.access(lock_filename, os.W_OK) + else: + # If the lock file doesn't exist yet, then the directory needs to be writable in + # order to create and use the lock file. + use_lock = os.access(os.path.dirname(lock_filename), os.W_OK) + + if not use_lock: + warnings.warn( + f"Lacking permissions to use lock file on cache '{filename}'.\nUse at your own risk!", + UserWarning, + ) + + self.lmdb_env = lmdb.open( + str(filename), + subdir=False, + map_size=map_size, + max_readers=cpu_count * 2, + max_spare_txns=cpu_count * 2, + metasync=False, + sync=True, + readahead=False, + meminit=False, + readonly=read_only, + lock=use_lock, + ) + + # We have another cache here that makes sure we return the same object for the same key. Without it, + # you would get a different tensor, using different memory, every time you call __getitem__(), even + # if you call it with the same key. + # The downside is that we can't keep self.cache_cache up to date when multiple processes modify the + # cache at the same time. We can guarantee though that it is up to date as long as processes either + # write new values, or read existing ones. 
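As a usage sketch of the cache described above (the path, key, and tensor shape are made up):

```python
import torch

from allennlp.common.file_utils import TensorCache

cache = TensorCache("/tmp/image_features.lmdb")  # made-up path; any writable location works
key = "image-0001"
if key not in cache:
    # Store the result of some expensive computation, e.g. extracted image features.
    cache[key] = torch.randn(36, 2048)
features = cache[key]

# A second handle (e.g. from another process) can read the same file concurrently.
read_only_view = TensorCache("/tmp/image_features.lmdb", read_only=True)
assert key in read_only_view
```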
+ self.cache_cache: MutableMapping[str, Tensor] = WeakValueDictionary() + + @property + def read_only(self) -> bool: + return self.lmdb_env.flags()["readonly"] + + def __contains__(self, key: object): + if not isinstance(key, str): + return False + if key in self.cache_cache: + return True + encoded_key = key.encode() + with self.lmdb_env.begin(write=False) as txn: + result = txn.get(encoded_key) + return result is not None + + def __getitem__(self, key: str): + try: + return self.cache_cache[key] + except KeyError: + encoded_key = key.encode() + with self.lmdb_env.begin(write=False) as txn: + buffer = txn.get(encoded_key) + if buffer is None: + raise KeyError() + tensor = torch.load(io.BytesIO(buffer), map_location="cpu") + self.cache_cache[key] = tensor + return tensor + + def __setitem__(self, key: str, tensor: torch.Tensor): + if self.read_only: + raise ValueError("cannot write to a read-only cache") + + encoded_key = key.encode() + buffer = io.BytesIO() + if tensor.storage().size() != np.prod(tensor.size()): + tensor = tensor.clone() + assert tensor.storage().size() == np.prod(tensor.size()) + torch.save(tensor.detach(), buffer, pickle_protocol=pickle.HIGHEST_PROTOCOL) + with self.lmdb_env.begin(write=True) as txn: + txn.put(encoded_key, buffer.getbuffer()) + + self.cache_cache[key] = tensor + + def __delitem__(self, key: str): + if self.read_only: + raise ValueError("cannot write to a read-only cache") + + encoded_key = key.encode() + with self.lmdb_env.begin(write=True) as txn: + txn.delete(encoded_key) + + try: + del self.cache_cache[key] + except KeyError: + pass + + def __del__(self): + if self.lmdb_env is not None: + self.lmdb_env.close() + self.lmdb_env = None + + def __len__(self): + return self.lmdb_env.stat()["entries"] + + def __iter__(self): + # It is not hard to implement this, but we have not needed it so far. + raise NotImplementedError() + + class CacheFile: """ This is a context manager that makes robust caching easier. @@ -472,7 +651,7 @@ class CacheFile: """ def __init__( - self, cache_filename: Union[Path, str], mode: str = "w+b", suffix: str = ".tmp" + self, cache_filename: Union[PathLike, str], mode: str = "w+b", suffix: str = ".tmp" ) -> None: self.cache_filename = ( cache_filename if isinstance(cache_filename, Path) else Path(cache_filename) @@ -502,6 +681,78 @@ def __exit__(self, exc_type, exc_value, traceback): return False +class LocalCacheResource: + """ + This is a context manager that can be used to fetch and cache arbitrary resources locally + using the same mechanisms that `cached_path` uses for remote resources. + + It can be used, for example, when you want to cache the result of an expensive computation. + + # Examples + + ```python + with LocalCacheResource("long-computation", "v1") as cache: + if cache.cached(): + with cache.reader() as f: + # read from cache + else: + with cache.writer() as f: + # do the computation + # ... + # write to cache + ``` + """ + + def __init__(self, resource_name: str, version: str, cache_dir: str = CACHE_DIRECTORY) -> None: + self.resource_name = resource_name + self.version = version + self.cache_dir = cache_dir + self.path = os.path.join(self.cache_dir, _resource_to_filename(resource_name, version)) + self.file_lock = FileLock(self.path + ".lock") + + def cached(self) -> bool: + return os.path.exists(self.path) + + @contextmanager + def writer(self, mode="w"): + if self.cached(): + raise ValueError( + f"local cache of {self.resource_name} (version '{self.version}') already exists!" 
+ ) + + with CacheFile(self.path, mode=mode) as f: + yield f + + meta = _Meta( + resource=self.resource_name, + cached_path=self.path, + creation_time=time.time(), + etag=self.version, + size=_get_resource_size(self.path), + ) + meta.to_file() + + @contextmanager + def reader(self, mode="r"): + if not self.cached(): + raise ValueError( + f"local cache of {self.resource_name} (version '{self.version}') does not exist yet!" + ) + + with open(self.path, mode) as f: + yield f + + def __enter__(self): + self.file_lock.acquire() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.file_lock.release() + if exc_value is None: + return True + return False + + @dataclass class _Meta: """ @@ -670,9 +921,9 @@ def get_file_extension(path: str, dot=True, lower: bool = True): def open_compressed( - filename: Union[str, Path], mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs + filename: Union[str, PathLike], mode: str = "rt", encoding: Optional[str] = "UTF-8", **kwargs ): - if isinstance(filename, Path): + if not isinstance(filename, str): filename = str(filename) open_fn: Callable = open @@ -684,10 +935,10 @@ def open_compressed( import bz2 open_fn = bz2.open - return open_fn(filename, mode=mode, encoding=encoding, **kwargs) + return open_fn(cached_path(filename), mode=mode, encoding=encoding, **kwargs) -def text_lines_from_file(filename: Union[str, Path], strip_lines: bool = True) -> Iterator[str]: +def text_lines_from_file(filename: Union[str, PathLike], strip_lines: bool = True) -> Iterator[str]: with open_compressed(filename, "rt", encoding="UTF-8", errors="replace") as p: if strip_lines: for line in p: @@ -696,7 +947,7 @@ def text_lines_from_file(filename: Union[str, Path], strip_lines: bool = True) - yield from p -def json_lines_from_file(filename: Union[str, Path]) -> Iterable[Union[list, dict]]: +def json_lines_from_file(filename: Union[str, PathLike]) -> Iterable[Union[list, dict]]: return (json.loads(line) for line in text_lines_from_file(filename)) diff --git a/allennlp/common/testing/__init__.py b/allennlp/common/testing/__init__.py index b2bd0ae76d8..06168a987f9 100644 --- a/allennlp/common/testing/__init__.py +++ b/allennlp/common/testing/__init__.py @@ -10,10 +10,12 @@ from allennlp.common.testing.model_test_case import ModelTestCase from allennlp.common.testing.distributed_test import run_distributed_test +from allennlp.modules.transformer import TransformerModule + from allennlp.training.metrics import Metric -_available_devices = ["cpu"] + (["cuda"] if torch.cuda.is_available() else []) +_available_devices = ["cpu"] + (["cuda:0"] if torch.cuda.is_available() else []) def multi_device(test_method): @@ -100,3 +102,30 @@ def global_distributed_metric( atol = exact[1] assert_metrics_values(metrics, desired_values, rtol, atol) # type: ignore + + +def assert_equal_parameters( + old_module: torch.nn.Module, + new_module: TransformerModule, + ignore_missing: bool = False, + mapping: Optional[Dict] = None, +): + """ + Tests if the parameters present in the `new_module` are equal to the ones in `old_module`. + Note that any parameters present in the `old_module` that are not present in `new_module` + are ignored. 
+ """ + mapping = mapping or {} + + old_parameters = dict(old_module.named_parameters()) + present_only_in_old = set(old_parameters.keys()) + + for name, parameter in new_module.named_parameters(): + for key, val in mapping.items(): + name = name.replace(key, val) + if ignore_missing: + if name not in old_parameters: + continue + present_only_in_old.remove(name) + assert torch.allclose(old_parameters[name], parameter) + return present_only_in_old diff --git a/allennlp/common/testing/distributed_test.py b/allennlp/common/testing/distributed_test.py index 81ada65184c..7ef00e2e0e8 100644 --- a/allennlp/common/testing/distributed_test.py +++ b/allennlp/common/testing/distributed_test.py @@ -14,8 +14,8 @@ def init_process( func: Callable, func_args: Tuple = None, func_kwargs: Dict[str, Any] = None, - master_addr: str = "127.0.0.1", - master_port: int = 29500, + primary_addr: str = "127.0.0.1", + primary_port: int = 29500, ): assert world_size > 1 @@ -27,14 +27,14 @@ def init_process( torch.cuda.set_device(int(gpu_id)) dist.init_process_group( backend="nccl", - init_method=f"tcp://{master_addr}:{master_port}", + init_method=f"tcp://{primary_addr}:{primary_port}", world_size=world_size, rank=global_rank, ) else: dist.init_process_group( backend="gloo", - init_method=f"tcp://{master_addr}:{master_port}", + init_method=f"tcp://{primary_addr}:{primary_port}", world_size=world_size, rank=global_rank, timeout=datetime.timedelta(seconds=120), diff --git a/allennlp/common/testing/model_test_case.py b/allennlp/common/testing/model_test_case.py index 4086c481914..d920152b2fc 100644 --- a/allennlp/common/testing/model_test_case.py +++ b/allennlp/common/testing/model_test_case.py @@ -43,7 +43,7 @@ def set_up_model( params["dataset_reader"], serialization_dir=serialization_dir ) # The dataset reader might be lazy, but a lazy list here breaks some of our tests. - instances = reader.read(str(dataset_file)) + instances = list(reader.read(str(dataset_file))) # Use parameters for vocabulary if they are present in the config file, so that choices like # "non_padded_namespaces", "min_count" etc. can be set if needed. if "vocabulary" in params: @@ -53,14 +53,13 @@ def set_up_model( vocab = Vocabulary.from_instances(instances) self.vocab = vocab self.instances = instances - self.instances.index_with(vocab) self.model = Model.from_params( vocab=self.vocab, params=params["model"], serialization_dir=serialization_dir ) # TODO(joelgrus) get rid of these # (a lot of the model tests use them, so they'll have to be changed) - self.dataset = Batch(list(self.instances)) + self.dataset = Batch(self.instances) self.dataset.index_instances(self.vocab) def ensure_model_can_train_save_and_load( @@ -74,6 +73,7 @@ def ensure_model_can_train_save_and_load( metric_terminal_value: float = None, metric_tolerance: float = 1e-4, disable_dropout: bool = True, + seed: int = None, ): """ # Parameters @@ -109,6 +109,11 @@ def ensure_model_can_train_save_and_load( If True we will set all dropout to 0 before checking gradients. (Otherwise, with small datasets, you may get zero gradients because of unlucky dropout.) 
""" + if seed is not None: + random.seed(seed) + numpy.random.seed(seed) + torch.manual_seed(seed) + save_dir = self.TEST_DIR / "save_and_load_test" archive_file = save_dir / "model.tar.gz" model = train_model_from_file(param_file, save_dir, overrides=overrides) @@ -137,21 +142,22 @@ def ensure_model_can_train_save_and_load( reader = archive.dataset_reader params = Params.from_file(param_file, params_overrides=overrides) - print("Reading with original model") - model_dataset = reader.read(params["validation_data_path"]) - model_dataset.index_with(model.vocab) - - print("Reading with loaded model") - loaded_dataset = reader.read(params["validation_data_path"]) - loaded_dataset.index_with(loaded_model.vocab) - # Need to duplicate params because DataLoader.from_params will consume. data_loader_params = params["data_loader"] data_loader_params["shuffle"] = False data_loader_params2 = Params(copy.deepcopy(data_loader_params.as_dict())) - data_loader = DataLoader.from_params(dataset=model_dataset, params=data_loader_params) - data_loader2 = DataLoader.from_params(dataset=loaded_dataset, params=data_loader_params2) + print("Reading with original model") + data_loader = DataLoader.from_params( + params=data_loader_params, reader=reader, data_path=params["validation_data_path"] + ) + data_loader.index_with(model.vocab) + + print("Reading with loaded model") + data_loader2 = DataLoader.from_params( + params=data_loader_params2, reader=reader, data_path=params["validation_data_path"] + ) + data_loader2.index_with(loaded_model.vocab) # We'll check that even if we index the dataset with each model separately, we still get # the same result out. diff --git a/allennlp/common/util.py b/allennlp/common/util.py index 1254cbff628..d7ffc8d4bcc 100644 --- a/allennlp/common/util.py +++ b/allennlp/common/util.py @@ -27,6 +27,7 @@ Tuple, TypeVar, Union, + Sequence, ) import numpy @@ -143,7 +144,7 @@ def lazy_groups_of(iterable: Iterable[A], group_size: int) -> Iterator[List[A]]: def pad_sequence_to_length( - sequence: List, + sequence: Sequence, desired_length: int, default_value: Callable[[], Any] = lambda: 0, padding_on_right: bool = True, @@ -174,6 +175,7 @@ def pad_sequence_to_length( padded_sequence : `List` """ + sequence = list(sequence) # Truncates the sequence to the desired length. if padding_on_right: padded_sequence = sequence[:desired_length] @@ -641,6 +643,64 @@ def format_size(size: int) -> str: return f"{size}B" +def nan_safe_tensor_divide(numerator, denominator): + """Performs division and handles divide-by-zero. + + On zero-division, sets the corresponding result elements to zero. + """ + result = numerator / denominator + mask = denominator == 0.0 + if not mask.any(): + return result + + # remove nan + result[mask] = 0.0 + return result + + +def shuffle_iterable(i: Iterable[T], pool_size: int = 1024) -> Iterable[T]: + import random + + i = iter(i) + pool = [] + + # fill up the pool + for item in i: + pool.append(item) + if len(pool) >= pool_size: + break + + # play in it + while len(pool) > 0: + index = random.randrange(len(pool)) + yield pool[index] + try: + pool[index] = next(i) + except StopIteration: + del pool[index] + break + + # drain it + random.shuffle(pool) + yield from pool + + +def cycle_iterator_function(iterator_function: Callable[[], Iterable[T]]) -> Iterator[T]: + """ + Functionally equivalent to `itertools.cycle(iterator_function())`, but this function does not + cache the result of calling the iterator like `cycle` does. 
Instead, we just call + `iterator_function()` again whenever we get a `StopIteration`. This should only be preferred + over `itertools.cycle` in cases where you're sure you don't want the caching behavior that's + done in `itertools.cycle`. + """ + iterator = iter(iterator_function()) + while True: + try: + yield next(iterator) + except StopIteration: + iterator = iter(iterator_function()) + + def hash_object(o: Any) -> str: """Returns a 32-character hash code of arbitrary Python objects.""" m = hashlib.blake2b() diff --git a/allennlp/data/__init__.py b/allennlp/data/__init__.py index 3eadb6d9b14..abbc408ec16 100644 --- a/allennlp/data/__init__.py +++ b/allennlp/data/__init__.py @@ -1,14 +1,15 @@ -from allennlp.data.dataloader import DataLoader, PyTorchDataLoader, allennlp_collate -from allennlp.data.dataset_readers.dataset_reader import ( - DatasetReader, - AllennlpDataset, - AllennlpLazyDataset, +from allennlp.data.data_loaders import ( + DataLoader, + TensorDict, + allennlp_collate, ) +from allennlp.data.dataset_readers.dataset_reader import DatasetReader from allennlp.data.fields.field import DataArray, Field from allennlp.data.fields.text_field import TextFieldTensors from allennlp.data.instance import Instance -from allennlp.data.samplers import BatchSampler, Sampler +from allennlp.data.samplers import BatchSampler from allennlp.data.token_indexers.token_indexer import TokenIndexer, IndexedTokenList from allennlp.data.tokenizers import Token, Tokenizer from allennlp.data.vocabulary import Vocabulary from allennlp.data.batch import Batch +from allennlp.data.image_loader import ImageLoader, TorchImageLoader diff --git a/allennlp/data/batch.py b/allennlp/data/batch.py index 72146b04bcf..0a98537c3d3 100644 --- a/allennlp/data/batch.py +++ b/allennlp/data/batch.py @@ -4,7 +4,7 @@ """ import logging -from collections import defaultdict +from collections import defaultdict, Counter from typing import Dict, Iterable, Iterator, List, Union import numpy @@ -39,12 +39,24 @@ def _check_types(self) -> None: """ Check that all the instances have the same types. """ - all_instance_fields_and_types: List[Dict[str, str]] = [ - {k: v.__class__.__name__ for k, v in x.fields.items()} for x in self.instances - ] - # Check all the field names and Field types are the same for every instance. - if not all(all_instance_fields_and_types[0] == x for x in all_instance_fields_and_types): - raise ConfigurationError("You cannot construct a Batch with non-homogeneous Instances.") + field_name_to_type_counters: Dict[str, Counter] = defaultdict(lambda: Counter()) + field_counts: Counter = Counter() + for instance in self.instances: + for field_name, value in instance.fields.items(): + field_name_to_type_counters[field_name][value.__class__.__name__] += 1 + field_counts[field_name] += 1 + for field_name, type_counters in field_name_to_type_counters.items(): + if len(type_counters) > 1: + raise ConfigurationError( + "You cannot construct a Batch with non-homogeneous Instances. " + f"Field '{field_name}' has {len(type_counters)} different types: " + f"{', '.join(type_counters.keys())}" + ) + if field_counts[field_name] != len(self.instances): + raise ConfigurationError( + "You cannot construct a Batch with non-homogeneous Instances. " + f"Field '{field_name}' present in some Instances but not others." 
+ ) def get_padding_lengths(self) -> Dict[str, Dict[str, int]]: """ @@ -71,7 +83,9 @@ def get_padding_lengths(self) -> Dict[str, Dict[str, int]]: return {**padding_lengths} def as_tensor_dict( - self, padding_lengths: Dict[str, Dict[str, int]] = None, verbose: bool = False + self, + padding_lengths: Dict[str, Dict[str, int]] = None, + verbose: bool = False, ) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: # This complex return type is actually predefined elsewhere as a DataArray, # but we can't use it because mypy doesn't like it. diff --git a/allennlp/data/data_loaders/__init__.py b/allennlp/data/data_loaders/__init__.py new file mode 100644 index 00000000000..8c2dfe8776c --- /dev/null +++ b/allennlp/data/data_loaders/__init__.py @@ -0,0 +1,4 @@ +from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict, allennlp_collate +from allennlp.data.data_loaders.multiprocess_data_loader import MultiProcessDataLoader, WorkerError +from allennlp.data.data_loaders.multitask_data_loader import MultiTaskDataLoader +from allennlp.data.data_loaders.simple_data_loader import SimpleDataLoader diff --git a/allennlp/data/data_loaders/data_loader.py b/allennlp/data/data_loaders/data_loader.py new file mode 100644 index 00000000000..ce4ce8ca160 --- /dev/null +++ b/allennlp/data/data_loaders/data_loader.py @@ -0,0 +1,62 @@ +from typing import List, Dict, Union, Iterator + +import torch + +from allennlp.common.registrable import Registrable +from allennlp.data.instance import Instance +from allennlp.data.batch import Batch +from allennlp.data.vocabulary import Vocabulary + + +TensorDict = Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]] +""" +`TensorDict` is the type we use for batches. +""" + + +def allennlp_collate(instances: List[Instance]) -> TensorDict: + """ + This is the default function used to turn a list of `Instance`s into a `TensorDict` + batch. + """ + batch = Batch(instances) + return batch.as_tensor_dict() + + +class DataLoader(Registrable): + """ + A `DataLoader` is responsible for generating batches of instances from a + [`DatasetReader`](/api/data/dataset_readers/dataset_reader/#datasetreader), + or another source of data. + + This is purely an abstract base class. All concrete subclasses must provide + implementations of the following methods: + + - [`__iter__()`](#__iter__) that creates an iterable of `TensorDict`s, + - [`iter_instances()`](#iter_instances) that creates an iterable of `Instance`s, + - [`index_with()`](#index_with) that should index the data with a vocabulary, and + - [`set_target_device()`](#set_target_device), which updates the device that batch + tensors should be put it when they are generated in `__iter__()`. + + Additionally, this class should also implement `__len__()` when possible. + + The default implementation is + [`MultiProcessDataLoader`](../multiprocess_data_loader/#multiprocessdataloader). 
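A minimal, hypothetical implementation of this interface, just to illustrate the methods listed above; a real loader would handle shuffling, sampling, and device placement more carefully, and the registered name is made up.

```python
from typing import Iterator, List, Optional

import torch

import allennlp.nn.util as nn_util
from allennlp.common.util import lazy_groups_of
from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict, allennlp_collate
from allennlp.data.instance import Instance
from allennlp.data.vocabulary import Vocabulary


@DataLoader.register("in-memory-list")  # made-up name
class InMemoryListDataLoader(DataLoader):
    """Serves fixed-size batches from a list of instances held in memory."""

    def __init__(self, instances: List[Instance], batch_size: int) -> None:
        self.instances = instances
        self.batch_size = batch_size
        self.target_device: Optional[torch.device] = None

    def __len__(self) -> int:
        return (len(self.instances) + self.batch_size - 1) // self.batch_size

    def __iter__(self) -> Iterator[TensorDict]:
        for group in lazy_groups_of(self.instances, self.batch_size):
            tensors = allennlp_collate(group)
            if self.target_device is not None:
                tensors = nn_util.move_to_device(tensors, self.target_device)
            yield tensors

    def iter_instances(self) -> Iterator[Instance]:
        yield from self.instances

    def index_with(self, vocab: Vocabulary) -> None:
        for instance in self.instances:
            instance.index_fields(vocab)

    def set_target_device(self, device: torch.device) -> None:
        self.target_device = device
```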
+ """ + + default_implementation = "multiprocess" + + def __len__(self) -> int: + raise TypeError + + def __iter__(self) -> Iterator[TensorDict]: + raise NotImplementedError + + def iter_instances(self) -> Iterator[Instance]: + raise NotImplementedError + + def index_with(self, vocab: Vocabulary) -> None: + raise NotImplementedError + + def set_target_device(self, device: torch.device) -> None: + raise NotImplementedError diff --git a/allennlp/data/data_loaders/multiprocess_data_loader.py b/allennlp/data/data_loaders/multiprocess_data_loader.py new file mode 100644 index 00000000000..654f6c36bce --- /dev/null +++ b/allennlp/data/data_loaders/multiprocess_data_loader.py @@ -0,0 +1,582 @@ +from collections import deque +import logging +from multiprocessing.process import BaseProcess +import random +import traceback +from typing import List, Iterator, Optional, Iterable, Union + +from overrides import overrides +import torch +import torch.multiprocessing as mp + +from allennlp.common.util import lazy_groups_of, shuffle_iterable +from allennlp.common.tqdm import Tqdm +from allennlp.data.instance import Instance +from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict, allennlp_collate +from allennlp.data.dataset_readers import DatasetReader, WorkerInfo, DatasetReaderInput +from allennlp.data.fields import TextField +from allennlp.data.samplers import BatchSampler +from allennlp.data.vocabulary import Vocabulary +import allennlp.nn.util as nn_util + + +logger = logging.getLogger(__name__) + + +@DataLoader.register("multiprocess") +class MultiProcessDataLoader(DataLoader): + """ + The `MultiProcessDataLoader` is a [`DataLoader`](../data_loader/#dataloader) + that's optimized for AllenNLP experiments. + + See + [Using your reader with multi-process or distributed data loading](/api/data/dataset_readers/dataset_reader/#datasetreader.using_your_reader_with_multi-process_or_distributed_data_loading) + for more information on how to optimize your `DatasetReader` for use with this `DataLoader`. + + # Parameters + + reader: `DatasetReader`, required + A `DatasetReader` used to load instances from the `data_path`. + + data_path: `DatasetReaderInput`, required + Passed to `DatasetReader.read()`. + + !!! Note + In a typical AllenNLP configuration file, the `reader` and `data_path` parameters don't + get an entry under the `data_loader`. The `reader` is constructed separately from + the corresponding `dataset_reader` params, and the `data_path` is taken from the + `train_data_path`, `validation_data_path`, or `test_data_path`. + + batch_size: `int`, optional (default = `None`) + When `batch_sampler` is unspecified, this option can be combined with `drop_last` + and `shuffle` to control automatic batch sampling. + + drop_last: `bool`, optional (default = `False`) + When `batch_sampler` is unspecified, this option can be combined with `batch_size` + and `shuffle` to control automatic batch sampling. + + If `True`, the last batch will be dropped it doesn't contain a full `batch_size` + number of `Instance`s. + + shuffle: `bool`, optional (default = `False`) + When `batch_sampler` is unspecified, this option can be combined with `batch_size` + and `drop_last` to control automatic batch sampling. + + batch_sampler: `BatchSampler`, optional (default = `None`) + A `BatchSampler` to handle batching. This option is mutually exclusive with + `batch_size`, `drop_last`, and `shuffle`. 
+ + batches_per_epoch: `int`, optional (default = `None`) + If specified, exactly `batches_per_epoch` batches will be generated with each call + to `__iter__()`. + + num_workers: `int`, optional (default = `0`) + The number of workers to use to read `Instances` in parallel. + If `num_workers = 0`, everything is done in the main process. Otherwise `num_workers` + workers are forked or spawned (depending on the value of `start_method`), each of which + calls `read()` on their copy of the `reader`. + + This means that in order for multi-process loading to be efficient when `num_workers > 1`, + the `reader` needs to implement + [`manual_multiprocess_sharding`](/api/data/dataset_readers/dataset_reader/#datasetreader). + + !!! Warning + Multi-processing code in Python is complicated! We highly recommend you read the short + [Best practices](#multiprocessdataloader.best_practices) and + [Common issues](#multiprocessdataloader.common_issues) sections below before using this option. + + max_instances_in_memory: `int`, optional (default = `None`) + If not specified, all instances will be read and cached in memory for the duration + of the data loader's life. This is generally ideal when your data can fit in memory + during training. However, when your datasets are too big, using this option + will turn on lazy loading, where only `max_instances_in_memory` instances are processed + at a time. + + !!! Note + This setting will affect how a `batch_sampler` is applied. If + `max_instances_in_memory` is `None`, the sampler will be applied to all `Instances`. + Otherwise the sampler will be applied to only `max_instances_in_memory` `Instances` + at a time. + + Therefore when using this option with a sampler, you should generally set it to a multiple of + the sampler's `batch_size` (if it has one). + + start_method: `str`, optional (default = `"fork"`) + The [start method](https://docs.python.org/3.7/library/multiprocessing.html#contexts-and-start-methods) + used to spin up workers. + + On Linux or OS X, "fork" usually has the lowest overhead for starting workers + but could potentially lead to dead-locks if you're using lower-level libraries that are not fork-safe. + + If you run into these issues, try using "spawn" instead. + + cuda_device: `Optional[Union[int, str, torch.device]]`, optional (default = `None`) + If given, batches will automatically be put on this device. + + !!! Note + This should typically not be set in an AllenNLP configuration file. The `Trainer` + will automatically call [`set_target_device()`](#set_target_device) before iterating + over batches. + + # Best practices + + - **Large datasets** + + If your dataset is too big to fit into memory (a common problem), you'll need to load it lazily. + This is done by simply setting the `max_instances_in_memory` parameter to a non-zero integer. + The optimal value depends on your use case. + + If you're using a `batch_sampler`, you will generally get better samples by setting + `max_instances_in_memory` to a higher number - such as 10 to 100 times your batch size - + since this determines how many `Instances` your `batch_sampler` gets to sample from at a time. + + If you're not using a `batch_sampler` then this number is much less important. Setting it to + 2 to 10 times your batch size is a reasonable value. + + Keep in mind that using `max_instances_in_memory` generally results in a slower + training loop unless you load data in worker processes by setting the `num_workers` option to a + non-zero integer (see below). 
That way data loading won't block the main process. + + - **Performance** + + The quickest way to increase the performance of data loading is adjust the `num_workers` parameter. + `num_workers` determines how many workers are used to read `Instances` from your + `DatasetReader`. By default, this is set to `0`, which means everything is done in the main process. + + Before trying to set `num_workers` to a non-zero number, you should make sure your `DatasetReader` + is [optimized for use with multi-process data loading] + (/api/data/dataset_readers/dataset_reader/#datasetreader.using_your_reader_with_multi-process_or_distributed_data_loading). + + # Common issues + + - **Dead-locks** + + Multiprocessing code in Python is complicated! Especially code that involves lower-level libraries + which may be spawning their own threads. If you run into dead-locks while + using `num_workers > 0`, luckily there are two simple work-arounds which usually fix the issue. + + The first work-around is to disable parallelism for these low-level libraries. + For example, setting the environment variables `OMP_NUM_THREADS=1` and `TOKENIZERS_PARALLELISM=0` + will do so for PyTorch and Numpy (for CPU operations) and HuggingFace Tokenizers, respectively. + + Alternatively, changing the `start_method` to "spawn" (when available, depending on your OS) + may fix your issues without disabling parallelism for other libraries. + + See [issue #4848](https://github.com/allenai/allennlp/issues/4848) for more info. + + Dead-locks could also be caused by running out of shared memory (see below). + + - **Shared memory restrictions** + + Tensors are passed between processes using shared memory, and some systems impose strict + limits on the allowed size of shared memory. + + Luckily this is simple to debug and simple to fix. + + First, to verify that this is your issue just watch your shared memory as your data loader runs. + For example, run `watch -n 0.3 'df -h | grep shm'`. + + If you're seeing your shared memory blow up until it maxes-out, then you either need to decrease + `max_instances_in_memory` or increase your system's `ulimit`. + + If you're using Docker, you can increase the shared memory available on a container by running + it with the option `--ipc=host` or by setting `--shm-size`. + + See [issue #4847](https://github.com/allenai/allennlp/issues/4847) for more info. + + """ # noqa: E501 + + def __init__( + self, + reader: DatasetReader, + data_path: DatasetReaderInput, + *, + batch_size: int = None, + drop_last: bool = False, + shuffle: bool = False, + batch_sampler: BatchSampler = None, + batches_per_epoch: int = None, + num_workers: int = 0, + max_instances_in_memory: int = None, + start_method: str = "fork", + cuda_device: Optional[Union[int, str, torch.device]] = None, + ) -> None: + # Do some parameter validation. 
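Before diving into the validation code below, here is a hedged usage sketch showing how the constructor options documented above are typically combined; the reader choice and file paths are illustrative only, not part of this change.

```python
from allennlp.data import Vocabulary
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

reader = SequenceTaggingDatasetReader()

# Small dataset: read everything into memory up front and shuffle each epoch.
loader = MultiProcessDataLoader(reader, "path/to/train.tsv", batch_size=32, shuffle=True)

# Large dataset: keep at most 10k instances in memory and read them in two worker
# processes so that loading doesn't block the main training loop. (For this to be
# efficient, the reader should support manual multi-process sharding.)
lazy_loader = MultiProcessDataLoader(
    reader,
    "path/to/huge_train.tsv",
    batch_size=32,
    shuffle=True,
    num_workers=2,
    max_instances_in_memory=10_000,
)

vocab = Vocabulary.from_instances(loader.iter_instances())
loader.index_with(vocab)  # required before iterating over batches
for batch in loader:
    pass  # each `batch` is a TensorDict
```

Note that `batch_size`/`shuffle`/`drop_last` and `batch_sampler` are mutually exclusive, which is exactly what the parameter validation below enforces.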
+ if num_workers is not None and num_workers < 0: + raise ValueError("num_workers cannot be a negative number") + + if batch_size is not None and batch_size < 1: + raise ValueError("batch_size must be at least 1") + + if batch_sampler is not None: + if batch_size is not None: + raise ValueError("batch_sampler option is mutually exclusive with batch_size") + + if drop_last: + raise ValueError("batch_sampler option is mutually exclusive with drop_last") + + if shuffle: + raise ValueError("batch_sampler option is mutually exclusive with shuffle") + elif batch_size is None: + raise ValueError("batch_size is required when batch_sampler is not supplied") + + if batches_per_epoch is not None and batches_per_epoch < 1: + raise ValueError("batches_per_epoch must be at least 1") + + if max_instances_in_memory is not None: + if batch_size is not None and max_instances_in_memory < batch_size: + raise ValueError("max_instances_in_memory must be at least batch_size") + elif max_instances_in_memory < 1: + raise ValueError("max_instances_in_memory must be at least 1") + + self.reader = reader + self.data_path = data_path + self.batch_size = batch_size + self.drop_last = drop_last + self.shuffle = shuffle + self.batch_sampler = batch_sampler + self.batches_per_epoch = batches_per_epoch + self.num_workers = num_workers + self.collate_fn = allennlp_collate + self.max_instances_in_memory = max_instances_in_memory + self.start_method = start_method + self.cuda_device: Optional[torch.device] = None + if cuda_device is not None: + if not isinstance(cuda_device, torch.device): + self.cuda_device = torch.device(cuda_device) + else: + self.cuda_device = cuda_device + + # Can only initialize CUDA in workers when these `start_methods` are used. + self._worker_cuda_safe = self.start_method in {"spawn", "forkserver"} + + # To make sure we have some backpressure in the worker queues we try to set + # reasonable defaults for the maximum size of these queues. + # They have to be big enough that is doesn't hurt performance, but small enough + # that they don't take up too many resources when there is a bottleneck on the + # consuming end of a queue. + effective_batch_size = ( + self.batch_size if self.batch_sampler is None else self.batch_sampler.get_batch_size() + ) + self._max_instance_queue_size = ( + None + if max_instances_in_memory is None + else 2 * self.num_workers * max_instances_in_memory + ) + self._max_batch_queue_size = ( + None + if max_instances_in_memory is None + else 2 * self.num_workers * max_instances_in_memory // (effective_batch_size or 1) + ) + + # If max_instances_in_memory is not given, we'll keep a cache of all instances in this list. + self._instances: Optional[List[Instance]] = None + # Keeps track of state when `batches_per_epoch` is used. + self._batch_generator: Optional[Iterator[TensorDict]] = None + # For indexing instances. + self._vocab: Optional[Vocabulary] = None + + if self.max_instances_in_memory is None: + # Load all instances right away. + deque(self.iter_instances(), maxlen=0) + + @overrides + def index_with(self, vocab: Vocabulary) -> None: + self._vocab = vocab + if self._instances: + for instance in self._instances: + instance.index_fields(vocab) + + @overrides + def __len__(self) -> int: + if self.batches_per_epoch is not None: + return self.batches_per_epoch + elif self.max_instances_in_memory is None: + # We haven't read the instances yet, so we do so now, caching them as we go. 
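The `deque(self.iter_instances(), maxlen=0)` calls in the constructor above and just below are simply a compact way to exhaust the instance iterator for its side effect of filling the cache: a zero-length deque discards every element as it arrives. A minimal illustration of the idiom:

```python
from collections import deque


def exhaust(iterable) -> None:
    # Consume the iterable completely without storing any of its items;
    # only the side effects of iteration (here: caching instances) remain.
    deque(iterable, maxlen=0)


exhaust(print(i) for i in range(3))  # prints 0, 1, 2 and keeps nothing in memory
```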
+ if not self._instances: + deque(self.iter_instances(), maxlen=0) + + if self.batch_sampler is not None: + return self.batch_sampler.get_num_batches(self._instances) # type: ignore + + num_instances = len(self._instances) # type: ignore + # We know batch_size won't be None here since `batch_sampler` is None. + batch_size: int = self.batch_size # type: ignore + if self.drop_last or num_instances % batch_size == 0: + return num_instances // batch_size + else: + return 1 + num_instances // batch_size + else: + # We can't know the number of batches for a lazy loader when batches_per_epoch + # is not specified. + raise TypeError + + @overrides + def __iter__(self) -> Iterator[TensorDict]: + if self._vocab is None: + raise ValueError( + "This DataLoader has not been indexed with a Vocabulary yet. " + "Did you forget to call DataLoader.index_with(vocab)?" + ) + + if self.batches_per_epoch is None: + yield from self._iter_batches() + else: + if self._batch_generator is None: + self._batch_generator = self._iter_batches() + for i in range(self.batches_per_epoch): + try: + yield next(self._batch_generator) + except StopIteration: # data_generator is exhausted + self._batch_generator = self._iter_batches() # so refresh it + yield next(self._batch_generator) + + @overrides + def iter_instances(self) -> Iterator[Instance]: + if self._instances: + yield from self._instances + else: + if self.max_instances_in_memory is None: + self._instances = [] + + if self.num_workers <= 0: + # Just read all instances in main process. + for instance in Tqdm.tqdm( + self.reader.read(self.data_path), desc="loading instances" + ): + self.reader.apply_token_indexers(instance) + if self.max_instances_in_memory is None: + self._instances.append(instance) # type: ignore + if self._vocab is not None: + instance.index_fields(self._vocab) + yield instance + else: + ctx = mp.get_context(self.start_method) + queue: mp.JoinableQueue = ( + ctx.JoinableQueue() + if self._max_instance_queue_size is None + else ctx.JoinableQueue(maxsize=self._max_instance_queue_size) + ) + workers = self._start_instance_workers(queue, ctx) + + try: + for instance in Tqdm.tqdm( + self._gather_instances(queue), desc="loading instances" + ): + if self.max_instances_in_memory is None: + self._instances.append(instance) # type: ignore + yield instance + finally: + if hasattr(queue, "close"): # for compat with different Python versions. + queue.close() # type: ignore[attr-defined] + self._join_workers(workers, queue) + + @overrides + def set_target_device(self, device: torch.device) -> None: + self.cuda_device = device + + def _iter_batches(self) -> Iterator[TensorDict]: + if self._instances is not None or self.num_workers <= 0: + for batch in self._instances_to_batches(self.iter_instances(), move_to_device=True): + yield batch + else: + ctx = mp.get_context(self.start_method) + + queue: mp.JoinableQueue = ( + ctx.JoinableQueue() + if self._max_batch_queue_size is None + else ctx.JoinableQueue(maxsize=self._max_batch_queue_size) + ) + workers = self._start_batch_workers(queue, ctx) + + try: + # We can now start consuming from the `queue` as the batch workers + # produce batches. + done_count: int = 0 + while done_count < self.num_workers: + for batch, worker_error in iter(queue.get, (None, None)): + if worker_error is not None: + e, tb = worker_error + raise WorkerError(e, tb) + + if not self._worker_cuda_safe and self.cuda_device is not None: + # Need to move batch to target device now. 
+ batch = nn_util.move_to_device(batch, self.cuda_device) + yield batch + queue.task_done() + done_count += 1 + finally: + if hasattr(queue, "close"): # for compat with different Python versions. + queue.close() # type: ignore[attr-defined] + self._join_workers(workers, queue) + + def _start_instance_workers(self, queue: mp.JoinableQueue, ctx) -> List[BaseProcess]: + workers: List[BaseProcess] = [] + for worker_id in range(self.num_workers): + worker: BaseProcess = ctx.Process( + target=self._instance_worker, args=(worker_id, queue), daemon=True + ) + worker.start() + workers.append(worker) + return workers + + def _start_batch_workers(self, queue: mp.JoinableQueue, ctx) -> List[BaseProcess]: + workers: List[BaseProcess] = [] + for worker_id in range(self.num_workers): + worker: BaseProcess = ctx.Process( + target=self._batch_worker, args=(worker_id, queue), daemon=True + ) + worker.start() + workers.append(worker) + return workers + + def _join_workers(self, workers: List[BaseProcess], queue) -> None: + # Each worker will be blocking on a call to `queue.join()`, + # calling `queue.task_done()` times the number of workers will + # call the `queue.join()` to return, and each worker should exit on its own. + for _ in range(len(workers)): + try: + queue.task_done() + except ValueError: + # This happens if a worker died early. + break + # If for some reason the workers don't exit properly, we go through and terminate + # them anyway. + for worker in workers: + if worker.is_alive(): + worker.terminate() + + def _instance_worker(self, worker_id: int, queue: mp.JoinableQueue) -> None: + try: + self.reader._set_worker_info(WorkerInfo(self.num_workers, worker_id)) + instances = self.reader.read(self.data_path) + checked_for_token_indexers: bool = False + for instance in instances: + # Check the first instance to make sure it doesn't contain any TextFields with + # token_indexers because we don't want to be duplicating those by sending + # them across processes. + if not checked_for_token_indexers: + for field_name, field in instance.fields.items(): + if isinstance(field, TextField) and field._token_indexers is not None: + raise ValueError( + f"Found a TextField ({field_name}) with token_indexers already " + "applied, but you're using num_workers > 0 in your data loader. " + "Make sure your dataset reader's text_to_instance() method doesn't " + "add any token_indexers to the TextFields it creates. Instead, the token_indexers " + "should be added to the instances in the apply_token_indexers() method of your " + "dataset reader (which you'll have to implement if you haven't done " + "so already)." + ) + checked_for_token_indexers = True + queue.put((instance, None)) + except Exception as e: + queue.put((None, (repr(e), traceback.format_exc()))) + + # Indicate to the consumer that this worker is finished. + queue.put((None, None)) + + # Wait until this process can safely exit. + queue.join() + + def _batch_worker(self, worker_id: int, queue: mp.JoinableQueue) -> None: + try: + self.reader._set_worker_info(WorkerInfo(self.num_workers, worker_id)) + instances = self.reader.read(self.data_path) + for batch in self._instances_to_batches( + instances, move_to_device=self._worker_cuda_safe + ): + queue.put((batch, None)) + except Exception as e: + queue.put((None, (repr(e), traceback.format_exc()))) + + # Indicate to the consumer (main thread) that this worker is finished. + queue.put((None, None)) + + # Wait until this process can safely exit. 
+ queue.join() + + def _gather_instances(self, queue: mp.JoinableQueue) -> Iterable[Instance]: + done_count: int = 0 + while done_count < self.num_workers: + for instance, worker_error in iter(queue.get, (None, None)): + if worker_error is not None: + e, tb = worker_error + raise WorkerError(e, tb) + + self.reader.apply_token_indexers(instance) + if self._vocab is not None: + instance.index_fields(self._vocab) + yield instance + queue.task_done() + done_count += 1 + + def _index_instance(self, instance: Instance) -> Instance: + self.reader.apply_token_indexers(instance) + assert self._vocab is not None + instance.index_fields(self._vocab) + return instance + + def _instances_to_batches( + self, instance_iterator: Iterable[Instance], move_to_device + ) -> Iterator[TensorDict]: + instance_iterator = (self._index_instance(instance) for instance in instance_iterator) + + if move_to_device and self.cuda_device is not None: + tensorize = lambda batch: nn_util.move_to_device( # noqa: E731 + self.collate_fn(batch), self.cuda_device + ) + else: + tensorize = self.collate_fn + + if self.batch_sampler is not None: + instance_chunks: Iterable[List[Instance]] + + if self.max_instances_in_memory is not None: + instance_chunks = lazy_groups_of(instance_iterator, self.max_instances_in_memory) + else: + instance_chunks = [list(instance_iterator)] + + for instances in instance_chunks: + batches = ( + [instances[i] for i in batch_indices] + for batch_indices in self.batch_sampler.get_batch_indices(instances) + ) + for batch in batches: + yield tensorize(batch) + else: + # Safe to assume this is not `None` when `self.batch_sampler` is `None`. + assert self.batch_size is not None + + if self.shuffle: + if self.max_instances_in_memory is not None: + instance_iterator = shuffle_iterable( + instance_iterator, + self.max_instances_in_memory, + ) + else: + # At this point we've already loaded the instances in memory and indexed them, + # so this won't take long. + instance_iterator = list(instance_iterator) + random.shuffle(instance_iterator) + + for batch in lazy_groups_of(instance_iterator, self.batch_size): + if self.drop_last and len(batch) < self.batch_size: + break + yield tensorize(batch) + + +class WorkerError(Exception): + """ + An error raised when a worker fails. + """ + + def __init__(self, original_err_repr: str, traceback: List[str]) -> None: + super().__init__( + f"worker raised {original_err_repr}\n\n" + " Traceback from worker:\n " + "".join(traceback) + # Remove the first line of the traceback since it's redundant. + .replace("Traceback (most recent call last):\n", "") + # Give a little indentation so it's clear this traceback is separate from the traceback + # in the main process. 
+ .replace("\n", "\n ") + ) diff --git a/allennlp/data/data_loaders/multitask_data_loader.py b/allennlp/data/data_loaders/multitask_data_loader.py new file mode 100644 index 00000000000..222bd7d8324 --- /dev/null +++ b/allennlp/data/data_loaders/multitask_data_loader.py @@ -0,0 +1,291 @@ +from typing import Any, Dict, Iterable, Iterator, Union, Optional +import itertools +import math + +import torch +from overrides import overrides + +from allennlp.common import util +from allennlp.data.dataset_readers.dataset_reader import DatasetReader, DatasetReaderInput +from allennlp.data.batch import Batch +from allennlp.data.data_loaders.data_loader import DataLoader, TensorDict +from allennlp.data.data_loaders.multiprocess_data_loader import MultiProcessDataLoader +from allennlp.data.data_loaders.multitask_scheduler import MultiTaskScheduler +from allennlp.data.data_loaders.multitask_epoch_sampler import MultiTaskEpochSampler +from allennlp.data.dataset_readers.multitask import MultiTaskDatasetReader +from allennlp.data.instance import Instance +from allennlp.data.vocabulary import Vocabulary +import allennlp.nn.util as nn_util + + +def maybe_shuffle_instances(loader: DataLoader, shuffle: bool) -> Iterable[Instance]: + if shuffle: + return util.shuffle_iterable(loader.iter_instances()) + else: + return loader.iter_instances() + + +@DataLoader.register("multitask") +class MultiTaskDataLoader(DataLoader): + """ + A `DataLoader` intended for multi-task learning. The basic idea is that you use a + `MultiTaskDatasetReader`, which takes a dictionary of `DatasetReaders`, keyed by some name. You + use those same names for various parameters here, including the data paths that get passed to + each reader. We will load each dataset and iterate over instances in them using a + `MultiTaskEpochSampler` and a `MultiTaskScheduler`. The `EpochSampler` says how much to use + from each dataset at each epoch, and the `Scheduler` orders the instances in the epoch however + you want. Both of these are designed to be used in conjunction with trainer `Callbacks`, if + desired, to have the sampling and/or scheduling behavior be dependent on the current state of + training. + + While it is not necessarily required, this `DatasetReader` was designed to be used alongside a + `MultiTaskModel`, which can handle instances coming from different datasets. If your datasets + are similar enough (say, they are all reading comprehension datasets with the same format), or + your model is flexible enough, then you could feasibly use this `DataLoader` with a normal, + non-multitask `Model`. + + Registered as a `DataLoader` with name "multitask". + + # Parameters + + reader: `MultiTaskDatasetReader` + data_path: `Dict[str, str]` + One file per underlying dataset reader in the `MultiTaskDatasetReader`, which will be passed + to those readers to construct one `DataLoader` per dataset. + scheduler: `MultiTaskScheduler`, optional (default = `HomogeneousRoundRobinScheduler`) + The `scheduler` determines how instances are ordered within an epoch. By default, we'll + select one batch of instances from each dataset in turn, trying to ensure as uniform a mix + of datasets as possible. Note that if your model can handle it, using a + `RoundRobinScheduler` is likely better than a `HomogeneousRoundRobinScheduler` (because it + does a better job mixing gradient signals from various datasets), so you may want to + consider switching. 
We use the homogeneous version as default because it should work for + any allennlp model, while the non-homogeneous one might not. + sampler: `MultiTaskEpochSampler`, optional (default = `None`) + Only used if `instances_per_epoch` is not `None`. If we need to select a subset of the data + for an epoch, this `sampler` will tell us with what proportion we should sample from each + dataset. For instance, we might want to focus more on datasets that are underperforming in + some way, by having those datasets contribute more instances this epoch than other datasets. + instances_per_epoch: `int`, optional (default = `None`) + If not `None`, we will use this many instances per epoch of training, drawing from the + underlying datasets according to the `sampler`. + num_workers: `Dict[str, int]`, optional (default = `None`) + Used when creating one `MultiProcessDataLoader` per dataset. If you want non-default + behavior for this parameter in the `DataLoader` for a particular dataset, pass the + corresponding value here, keyed by the dataset name. + max_instances_in_memory: `Dict[str, int]`, optional (default = `None`) + Used when creating one `MultiProcessDataLoader` per dataset. If you want non-default + behavior for this parameter in the `DataLoader` for a particular dataset, pass the + corresponding value here, keyed by the dataset name. + start_method: `Dict[str, str]`, optional (default = `None`) + Used when creating one `MultiProcessDataLoader` per dataset. If you want non-default + behavior for this parameter in the `DataLoader` for a particular dataset, pass the + corresponding value here, keyed by the dataset name. + instance_queue_size: `Dict[str, int]`, optional (default = `None`) + Used when creating one `MultiProcessDataLoader` per dataset. If you want non-default + behavior for this parameter in the `DataLoader` for a particular dataset, pass the + corresponding value here, keyed by the dataset name. + instance_chunk_size: `Dict[str, int]`, optional (default = `None`) + Used when creating one `MultiProcessDataLoader` per dataset. If you want non-default + behavior for this parameter in the `DataLoader` for a particular dataset, pass the + corresponding value here, keyed by the dataset name. + shuffle: `bool`, optional (default = `True`) + If `False`, we will not shuffle the instances that come from each underlying data loader. + You almost certainly never want to use this except when debugging. + cuda_device: `Optional[Union[int, str, torch.device]]`, optional (default = `None`) + If given, batches will automatically be put on this device. + + !!! Note + This should typically not be set in an AllenNLP configuration file. The `Trainer` + will automatically call [`set_target_device()`](#set_target_device) before iterating + over batches. 
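Putting these parameters together, here is a hedged construction sketch. The file paths are hypothetical, and this assumes the `MultiTaskDatasetReader` is built from a dictionary of readers keyed by head name (as suggested by the `reader.readers` attribute used below).

```python
from allennlp.data.data_loaders import MultiTaskDataLoader
from allennlp.data.data_loaders.multitask_epoch_sampler import UniformSampler
from allennlp.data.data_loaders.multitask_scheduler import HomogeneousRoundRobinScheduler
from allennlp.data.dataset_readers import (
    MultiTaskDatasetReader,
    SequenceTaggingDatasetReader,
    TextClassificationJsonReader,
)

reader = MultiTaskDatasetReader(
    readers={
        "tagging": SequenceTaggingDatasetReader(),
        "classification": TextClassificationJsonReader(),
    }
)
loader = MultiTaskDataLoader(
    reader=reader,
    data_path={
        "tagging": "path/to/tagging_train.tsv",
        "classification": "path/to/classification_train.jsonl",
    },
    scheduler=HomogeneousRoundRobinScheduler(batch_size=16),
    sampler=UniformSampler(),
    instances_per_epoch=10_000,
)
```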
+ """ + + def __init__( + self, + reader: MultiTaskDatasetReader, + data_path: Dict[str, str], + scheduler: MultiTaskScheduler, + *, + sampler: MultiTaskEpochSampler = None, + instances_per_epoch: int = None, + num_workers: Dict[str, int] = None, + max_instances_in_memory: Dict[str, int] = None, + start_method: Dict[str, str] = None, + instance_queue_size: Dict[str, int] = None, + instance_chunk_size: Dict[str, int] = None, + shuffle: bool = True, + cuda_device: Optional[Union[int, str, torch.device]] = None, + ) -> None: + self.readers = reader.readers + self.data_paths = data_path + self.scheduler = scheduler + self.sampler = sampler + self.cuda_device: Optional[torch.device] = None + if cuda_device is not None: + if not isinstance(cuda_device, torch.device): + self.cuda_device = torch.device(cuda_device) + else: + self.cuda_device = cuda_device + + self._instances_per_epoch = instances_per_epoch + self._shuffle = shuffle + + if instances_per_epoch is not None and sampler is None: + raise ValueError( + "You must provide an EpochSampler if you want to not use all instances every epoch." + ) + + self._num_workers = num_workers or {} + self._max_instances_in_memory = max_instances_in_memory or {} + self._start_method = start_method or {} + self._instance_queue_size = instance_queue_size or {} + self._instance_chunk_size = instance_chunk_size or {} + + if self.readers.keys() != self.data_paths.keys(): + raise ValueError( + f"Mismatch between readers ({self.readers.keys()}) and data paths " + f"({self.data_paths.keys()})" + ) + self._loaders = {key: self._make_data_loader(key) for key in self.readers} + + # This stores our current iterator with each dataset, so we don't just iterate over the + # first k instances every epoch if we're using instances_per_epoch. We'll grab instances + # from here each epoch, and refresh it when it runs out. We only use this in the case that + # instances_per_epoch is not None, but these iterators are lazy, so always creating them + # doesn't hurt anything. + self._iterators: Dict[str, Iterator[Instance]] = { + # NOTE: The order in which we're calling these iterator functions is important. We want + # an infinite iterator over the data, but we want the order in which we iterate over the + # data to be different at every epoch. The cycle function will give us an infinite + # iterator, and it will call the lambda function each time it runs out of instances, + # which will produce a new shuffling of the dataset. + key: util.cycle_iterator_function( + # This default argument to the lambda function is necessary to create a new scope + # for the loader variable, so a _different_ loader gets saved for every iterator. + # Dictionary comprehensions don't create new scopes in python. If you don't have + # this loader, you end up with `loader` always referring to the last loader in the + # iteration... mypy also doesn't know what to do with this, for some reason I can't + # figure out. + lambda l=loader: maybe_shuffle_instances(l, self._shuffle) # type: ignore + ) + for key, loader in self._loaders.items() + } + + @overrides + def __len__(self) -> int: + if self._instances_per_epoch is None: + # This will raise a TypeError if any of the underlying loaders doesn't have a length, + # which is actually what we want. 
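A quick aside on the `lambda l=loader: ...` default-argument trick in the dictionary comprehension above: it is needed because Python closures bind late, so without it every lambda would close over the same loop variable and see only its final value. A minimal illustration of the pitfall being avoided:

```python
# Late binding: all three lambdas share one loop variable, so they all return "c".
late = {key: (lambda: key) for key in "abc"}
assert late["a"]() == "c"

# Binding the current value as a default argument captures it at definition time.
early = {key: (lambda k=key: k) for key in "abc"}
assert early["a"]() == "a"
```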
+ return self.scheduler.count_batches( + {dataset: len(loader) for dataset, loader in self._loaders.items()} + ) + else: + return self.scheduler.count_batches( + {dataset: self._instances_per_epoch for dataset in self._loaders.keys()} + ) + + @overrides + def __iter__(self) -> Iterator[TensorDict]: + epoch_instances = self._get_instances_for_epoch() + return ( + nn_util.move_to_device( + Batch(instances).as_tensor_dict(), + -1 if self.cuda_device is None else self.cuda_device, + ) + for instances in self.scheduler.batch_instances(epoch_instances) + ) + + @overrides + def iter_instances(self) -> Iterator[Instance]: + # The only external contract for this method is that it iterates over instances + # individually; it doesn't actually specify anything about batching or anything else. The + # implication is that you iterate over all instances in the dataset, in an arbitrary order. + # The only external uses of this method are in vocabulary construction (the + # MultiProcessDataLoader uses this function internally when constructing batches, but that's + # an implementation detail). + # + # So, the only thing we need to do here is iterate over all instances from all datasets, and + # that's sufficient. We won't be using this for batching, because that requires some + # complex, configurable scheduling. + # + # The underlying data loaders here could be using multiprocessing; we don't need to worry + # about that in this class. Caching is also handled by the underlying data loaders. + for loader in self._loaders.values(): + yield from loader.iter_instances() + + @overrides + def index_with(self, vocab: Vocabulary) -> None: + for loader in self._loaders.values(): + loader.index_with(vocab) + + @overrides + def set_target_device(self, device: torch.device) -> None: + self.cuda_device = device + + def _get_instances_for_epoch(self) -> Dict[str, Iterable[Instance]]: + if self._instances_per_epoch is None: + return { + key: maybe_shuffle_instances(loader, self._shuffle) + for key, loader in self._loaders.items() + } + if self.sampler is None: + # We already checked for this in the constructor, so this should never happen unless you + # modified the object after creation. But mypy is complaining, so here's another check. + raise ValueError( + "You must specify an EpochSampler if self._instances_per_epoch is not None." + ) + dataset_proportions = self.sampler.get_task_proportions(self._loaders) + proportion_sum = sum(dataset_proportions.values()) + num_instances_per_dataset = { + key: math.floor(proportion * self._instances_per_epoch / proportion_sum) + for key, proportion in dataset_proportions.items() + } + return { + key: itertools.islice(self._iterators[key], num_instances) + for key, num_instances in num_instances_per_dataset.items() + } + + def _make_data_loader(self, key: str) -> MultiProcessDataLoader: + kwargs: Dict[str, Any] = { + "reader": _MultitaskDatasetReaderShim(self.readers[key], key), + "data_path": self.data_paths[key], + # We don't load batches from this data loader, only instances, but we have to set + # something for the batch size, so we set 1. 
+ "batch_size": 1, + } + if key in self._num_workers: + kwargs["num_workers"] = self._num_workers[key] + if key in self._max_instances_in_memory: + kwargs["max_instances_in_memory"] = self._max_instances_in_memory[key] + if key in self._start_method: + kwargs["start_method"] = self._start_method[key] + return MultiProcessDataLoader(**kwargs) + + +@DatasetReader.register("multitask_shim") +class _MultitaskDatasetReaderShim(DatasetReader): + """This dataset reader wraps another dataset reader and adds the name of the "task" into + each instance as a metadata field. This exists only to support `MultitaskDataLoader`. You + should not have to use this yourself.""" + + def __init__(self, inner: DatasetReader, head: str, **kwargs): + super().__init__(**kwargs) + self.inner = inner + self.head = head + + def read(self, file_path: DatasetReaderInput) -> Iterator[Instance]: + from allennlp.data.fields import MetadataField + + for instance in self.inner.read(file_path): + instance.add_field("task", MetadataField(self.head)) + yield instance + + def text_to_instance(self, *inputs) -> Instance: + from allennlp.data.fields import MetadataField + + instance = self.inner.text_to_instance(*inputs) + instance.add_field("task", MetadataField(self.head)) + return instance + + def apply_token_indexers(self, instance: Instance) -> None: + self.inner.apply_token_indexers(instance) diff --git a/allennlp/data/data_loaders/multitask_epoch_sampler.py b/allennlp/data/data_loaders/multitask_epoch_sampler.py new file mode 100644 index 00000000000..b3850c0742a --- /dev/null +++ b/allennlp/data/data_loaders/multitask_epoch_sampler.py @@ -0,0 +1,84 @@ +from typing import Any, Dict, Mapping + +from allennlp.common.registrable import Registrable +from allennlp.data.data_loaders.data_loader import DataLoader + + +class MultiTaskEpochSampler(Registrable): + """ + A class that determines with what proportion each dataset should be sampled for a given epoch. + This is used by the `MultiTaskDataLoader`. The main output of this class is the task proportion + dictionary returned by `get_task_proportions`, which specifies what percentage of the instances + for the current epoch should come from each dataset. To control this behavior as training + progresses, there is an `update_from_epoch_metrics` method, which should be called from a + `Callback` during training. + """ + + def get_task_proportions(self, data_loaders: Mapping[str, DataLoader]) -> Dict[str, float]: + """ + Given a dictionary of `DataLoaders` for each dataset, returns what percentage of the + instances for the current epoch of training should come from each dataset. The input + dictionary could be used to determine how many datasets there are (e.g., for uniform + sampling) or how big each dataset is (e.g., for sampling based on size), or it could be + ignored entirely. + """ + raise NotImplementedError + + def update_from_epoch_metrics(self, epoch_metrics: Dict[str, Any]) -> None: + """ + Some implementations of EpochSamplers change their behavior based on current epoch metrics. + This method is meant to be called from a `Callback`, to let the sampler update its sampling + proportions. If your sampling technique does not depend on epoch metrics, you do not need + to implement this method. + """ + raise NotImplementedError + + +@MultiTaskEpochSampler.register("uniform") +class UniformSampler(MultiTaskEpochSampler): + """ + Returns a uniform distribution over datasets at every epoch. + + Registered as a `MultiTaskEpochSampler` with name "uniform". 
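To make the sampler contract concrete, here is a small worked sketch of how the `MultiTaskDataLoader` turns the proportions returned by `get_task_proportions()` into per-dataset instance counts for an epoch; the numbers are illustrative.

```python
import math

# As returned by some sampler's get_task_proportions(); they need not sum to 1.
proportions = {"tagging": 1.0, "classification": 3.0}
instances_per_epoch = 1000

total = sum(proportions.values())
per_dataset = {
    name: math.floor(p * instances_per_epoch / total) for name, p in proportions.items()
}
# per_dataset == {"tagging": 250, "classification": 750}
```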
+ """ + + def get_task_proportions(self, data_loaders: Mapping[str, DataLoader]) -> Dict[str, float]: + return {key: 1 / len(data_loaders) for key in data_loaders} + + +@MultiTaskEpochSampler.register("weighted") +class WeightedSampler(MultiTaskEpochSampler): + """ + Returns a weighted distribution over datasets at every epoch, where every + task has a weight. + + Registered as a `MultiTaskEpochSampler` with name "weighted". + """ + + def __init__(self, weights: Dict[str, float]): + self.weights = weights + + def get_task_proportions(self, data_loaders: Mapping[str, DataLoader]) -> Dict[str, float]: + total = sum(self.weights[task] for task in data_loaders.keys()) + return {task: self.weights[task] / total for task in data_loaders.keys()} + + +@MultiTaskEpochSampler.register("proportional") +class ProportionalSampler(MultiTaskEpochSampler): + """ + Samples from every dataset according to its size. This will have essentially the same effect as + using all of the data at every epoch, but it lets you control for number of instances per epoch, + if you want to do that. This requires that all data loaders have a `__len__` (which means no + lazy loading). If you need this functionality with lazy loading, implement your own sampler + that takes dataset sizes as a constructor parameter. + + Registered as a `MultiTaskEpochSampler` with name "proportional". + """ + + def get_task_proportions(self, data_loaders: Mapping[str, DataLoader]) -> Dict[str, float]: + try: + sizes = {key: len(loader) for key, loader in data_loaders.items()} + except TypeError: + raise ValueError("ProportionalSampler got passed a data loader without a length") + total_size = sum(sizes.values()) + return {key: size / total_size for key, size in sizes.items()} diff --git a/allennlp/data/data_loaders/multitask_scheduler.py b/allennlp/data/data_loaders/multitask_scheduler.py new file mode 100644 index 00000000000..f77d070f498 --- /dev/null +++ b/allennlp/data/data_loaders/multitask_scheduler.py @@ -0,0 +1,134 @@ +from collections import defaultdict +from typing import Any, Dict, Iterable, Union, List, Mapping + +import more_itertools + +from allennlp.common.registrable import Registrable +from allennlp.data.instance import Instance + + +class MultiTaskScheduler(Registrable): + """ + A class that determines how to order instances within an epoch. + This is used by the `MultiTaskDataLoader`. The main operation performed by this class is to + take a dictionary of instance iterators, one for each dataset, and combine them into an + iterator of batches, based on some scheduling algorithm (such as round robin, randomly choosing + between available datasets, etc.). To control this behavior as training progresses, there is an + `update_from_epoch_metrics` method available, which should be called from a `Callback` during + training. Not all `MultiTaskSchedulers` will implement this method. + """ + + def batch_instances( + self, epoch_instances: Dict[str, Iterable[Instance]] + ) -> Iterable[List[Instance]]: + """ + Given a dictionary of `Iterable[Instance]` for each dataset, combines them into an + `Iterable` of batches of instances. + """ + raise NotImplementedError + + def update_from_epoch_metrics(self, epoch_metrics: Dict[str, Any]) -> None: + """ + In case you want to set the behavior of the scheduler based on current epoch metrics, you + can do that by calling this method from a `Callback`. If your scheduling technique does not + depend on epoch metrics, you do not need to implement this method. 
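As a hedged illustration of the kind of combination a scheduler's `batch_instances()` performs, here is the round-robin flavor, using plain strings in place of `Instance`s:

```python
import more_itertools

epoch_instances = {
    "tagging": iter(["t1", "t2", "t3"]),
    "classification": iter(["c1", "c2"]),
}
# Interleave the datasets, then chunk the combined stream into batches of 2.
batches = list(
    more_itertools.chunked(more_itertools.roundrobin(*epoch_instances.values()), 2)
)
# batches == [["t1", "c1"], ["t2", "c2"], ["t3"]]
```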
+ """ + raise NotImplementedError + + def count_batches(self, dataset_counts: Dict[str, int]) -> int: + """ + Given the number of instances per dataset, this returns the total number of batches + the scheduler will return. + """ + raise NotImplementedError + + default_implementation = "homogeneous_roundrobin" + + +def _chunked_iterator(i: Iterable, chunk_size: int, drop_last: bool): + chunks = more_itertools.chunked(i, chunk_size) + if drop_last: + return (chunk for chunk in chunks if len(chunk) == chunk_size) + else: + return chunks + + +@MultiTaskScheduler.register("roundrobin") +class RoundRobinScheduler(MultiTaskScheduler): + """ + Orders instances in a round-robin fashion, where we take one instance from every dataset in + turn. When one dataset runs out, we continue iterating round-robin through the rest. + + Registered as a `MultiTaskScheduler` with name "roundrobin". + """ + + def __init__(self, batch_size: int, drop_last: bool = False): + super().__init__() + self.batch_size = batch_size + self.drop_last = drop_last + + def batch_instances( + self, epoch_instances: Dict[str, Iterable[Instance]] + ) -> Iterable[List[Instance]]: + return _chunked_iterator( + more_itertools.roundrobin(*epoch_instances.values()), self.batch_size, self.drop_last + ) + + def count_batches(self, dataset_counts: Dict[str, int]) -> int: + instance_count = sum(dataset_counts.values()) + if self.drop_last or instance_count % self.batch_size == 0: + return instance_count // self.batch_size + else: + return 1 + (instance_count // self.batch_size) + + +@MultiTaskScheduler.register("homogeneous_roundrobin") +class HomogeneousRoundRobinScheduler(MultiTaskScheduler): + """ + Orders instances in a round-robin fashion, but grouped into batches composed entirely of + instances from one dataset. We'll return one batch from one dataset, then another batch from a + different dataset, etc. This is currently necessary in AllenNLP if your instances have + different fields for different datasets, as we can't currently combine instances with different + fields. + + When one dataset runs out, we continue iterating round-robin through the rest. + + If you want more fine-grained control over which datasets can be combined, it should be + relatively straightforward to write your own scheduler, following this logic, which allows some + datasets to be combined and others not. + + Registered as a `MultiTaskScheduler` with name "homogeneous_roundrobin". + + # Parameters + + batch_size: `Union[int, Dict[str, int]]` + Determines how many instances to group together in each dataset. If this is an `int`, the + same value is used for all datasets; otherwise, the keys must correspond to the dataset + names used elsewhere in the multi-task code. 
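As a worked sketch of the batch bookkeeping described above (illustrative numbers; each batch contains instances from only one dataset):

```python
dataset_counts = {"tagging": 10, "classification": 7}
batch_size = {"tagging": 4, "classification": 4}
drop_last = False

total_batches = 0
for name, count in dataset_counts.items():
    total_batches += count // batch_size[name]
    if not drop_last and count % batch_size[name] != 0:
        total_batches += 1
# total_batches == 5: tagging yields batches of [4, 4, 2] and classification [4, 3],
# which are then interleaved round-robin, one dataset per batch.
```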
+ """ + + def __init__(self, batch_size: Union[int, Dict[str, int]], drop_last: bool = False): + self.batch_size: Mapping[str, int] + if isinstance(batch_size, int): + self.batch_size = defaultdict(lambda: batch_size) # type: ignore + else: + self.batch_size = batch_size + self.drop_last = drop_last + + def batch_instances( + self, epoch_instances: Dict[str, Iterable[Instance]] + ) -> Iterable[List[Instance]]: + chunked_iterators = [ + _chunked_iterator(iterator, self.batch_size[dataset], self.drop_last) + for dataset, iterator in epoch_instances.items() + ] + return more_itertools.roundrobin(*chunked_iterators) + + def count_batches(self, dataset_counts: Dict[str, int]) -> int: + result = 0 + for dataset, count in dataset_counts.items(): + batch_size = self.batch_size[dataset] + result += count // batch_size + if not self.drop_last and count % batch_size != 0: + result += 1 + return result diff --git a/allennlp/data/data_loaders/simple_data_loader.py b/allennlp/data/data_loaders/simple_data_loader.py new file mode 100644 index 00000000000..26b66b30893 --- /dev/null +++ b/allennlp/data/data_loaders/simple_data_loader.py @@ -0,0 +1,92 @@ +import math +import random +from typing import Optional, List, Iterator + +from overrides import overrides +import torch + +from allennlp.common.util import lazy_groups_of +from allennlp.data.data_loaders.data_loader import DataLoader, allennlp_collate, TensorDict +from allennlp.data.dataset_readers import DatasetReader +from allennlp.data.instance import Instance +from allennlp.data.vocabulary import Vocabulary +import allennlp.nn.util as nn_util + + +@DataLoader.register("simple", constructor="from_dataset_reader") +class SimpleDataLoader(DataLoader): + """ + A very simple `DataLoader` that is mostly used for testing. 
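A quick usage sketch (the fixture path is hypothetical; any list of `Instance`s will do in a test):

```python
from allennlp.data import Vocabulary
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

reader = SequenceTaggingDatasetReader()
instances = list(reader.read("test_fixtures/sequence_tagging.tsv"))  # hypothetical path

loader = SimpleDataLoader(instances, batch_size=4, shuffle=True)
loader.index_with(Vocabulary.from_instances(instances))
batches = list(loader)  # a handful of TensorDicts, enough for a unit test
```

The `from_dataset_reader` classmethod (used as the constructor when this loader is built from a config with type `"simple"`) wraps the `reader.read()` step for you.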
+ """ + + def __init__( + self, + instances: List[Instance], + batch_size: int, + *, + shuffle: bool = False, + batches_per_epoch: Optional[int] = None, + vocab: Optional[Vocabulary] = None, + ) -> None: + self.instances = instances + self.batch_size = batch_size + self.shuffle = shuffle + self.batches_per_epoch = batches_per_epoch + self.vocab = vocab + self.cuda_device: Optional[torch.device] = None + self._batch_generator: Optional[Iterator[TensorDict]] = None + + def __len__(self) -> int: + return math.ceil(len(self.instances) / self.batch_size) + + @overrides + def __iter__(self) -> Iterator[TensorDict]: + if self.batches_per_epoch is None: + yield from self._iter_batches() + else: + if self._batch_generator is None: + self._batch_generator = self._iter_batches() + for i in range(self.batches_per_epoch): + try: + yield next(self._batch_generator) + except StopIteration: # data_generator is exhausted + self._batch_generator = self._iter_batches() # so refresh it + yield next(self._batch_generator) + + def _iter_batches(self) -> Iterator[TensorDict]: + if self.shuffle: + random.shuffle(self.instances) + for batch in lazy_groups_of(self.iter_instances(), self.batch_size): + tensor_dict = allennlp_collate(batch) + if self.cuda_device is not None: + tensor_dict = nn_util.move_to_device(tensor_dict, self.cuda_device) + yield tensor_dict + + @overrides + def iter_instances(self) -> Iterator[Instance]: + for instance in self.instances: + if self.vocab is not None: + instance.index_fields(self.vocab) + yield instance + + @overrides + def index_with(self, vocab: Vocabulary) -> None: + self.vocab = vocab + for instance in self.instances: + instance.index_fields(self.vocab) + + @overrides + def set_target_device(self, device: torch.device) -> None: + self.cuda_device = device + + @classmethod + def from_dataset_reader( + cls, + reader: DatasetReader, + data_path: str, + batch_size: int, + shuffle: bool = False, + batches_per_epoch: Optional[int] = None, + ) -> "SimpleDataLoader": + instances = list(reader.read(data_path)) + return cls(instances, batch_size, shuffle=shuffle, batches_per_epoch=batches_per_epoch) diff --git a/allennlp/data/dataloader.py b/allennlp/data/dataloader.py deleted file mode 100644 index e6f84a95f18..00000000000 --- a/allennlp/data/dataloader.py +++ /dev/null @@ -1,162 +0,0 @@ -from typing import List, Dict, Union, Iterator - -import torch -from torch.utils import data - -from allennlp.common.registrable import Registrable -from allennlp.common.lazy import Lazy -from allennlp.data.instance import Instance -from allennlp.data.batch import Batch -from allennlp.data.samplers import Sampler, BatchSampler - - -TensorDict = Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]] - - -def allennlp_collate(instances: List[Instance]) -> TensorDict: - batch = Batch(instances) - return batch.as_tensor_dict(batch.get_padding_lengths()) - - -class DataLoader(Registrable): - """ - A `DataLoader` is responsible for generating batches of instances from a `Dataset`, - or another source of data. This is essentially just an abstraction over `torch.utils.data.DataLoader`. - - This class only has one required method, `__iter__()`, that creates an iterable - of `TensorDict`s. Additionally, this class comes with a `__len__()` method - that just raises a `TypeError` by default. When possible, this should be overriden - to return the number of batches that will be generated by the `__iter__()` method. 
- """ - - default_implementation = "pytorch_dataloader" - - def __len__(self) -> int: - raise TypeError - - def __iter__(self) -> Iterator[TensorDict]: - raise NotImplementedError - - -@DataLoader.register("pytorch_dataloader", constructor="from_partial_objects") -class PyTorchDataLoader(data.DataLoader, DataLoader): - """ - A registrable version of the pytorch - [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). - Firstly, this class exists is so that we can construct a DataLoader - from a configuration file and have a different default `collate_fn`. - You can use this class directly in python code, but it is identical to using - pytorch dataloader with allennlp's custom collate function: - - ``` - from torch.utils.data import DataLoader - - from allennlp.data import allennlp_collate - # Construct a dataloader directly for a dataset which contains allennlp - # Instances which have _already_ been indexed. - my_loader = DataLoader(dataset, batch_size=32, collate_fn=allennlp_collate) - ``` - - Secondly, this class adds a `batches_per_epoch` parameter which, if given, determines the number - of batches after which an epoch ends. If this is `None`, then an epoch is set to be one full pass - through your data. You might use this if you have a very large dataset and want more frequent - checkpoints and evaluations on validation data, for instance. - - In a typical AllenNLP configuration file, the `dataset` parameter does not get an entry under - the "data_loader", it gets constructed separately. - """ - - def __init__( - self, - dataset: data.Dataset, - batch_size: int = 1, - shuffle: bool = False, - sampler: Sampler = None, - batch_sampler: BatchSampler = None, - num_workers: int = 0, - # NOTE: The default for collate_fn is different from the normal `None`. - # We assume that if you are using this class you are using an - # allennlp dataset of instances, which would require this. - collate_fn=allennlp_collate, - pin_memory: bool = False, - drop_last: bool = False, - timeout: int = 0, - worker_init_fn=None, - multiprocessing_context: str = None, - batches_per_epoch: int = None, - ): - super().__init__( - dataset=dataset, - batch_size=batch_size, - shuffle=shuffle, - sampler=sampler, - batch_sampler=batch_sampler, - num_workers=num_workers, - collate_fn=collate_fn, - pin_memory=pin_memory, - drop_last=drop_last, - timeout=timeout, - worker_init_fn=worker_init_fn, - multiprocessing_context=multiprocessing_context, - ) - self._data_generator = super().__iter__() - self._batches_per_epoch = batches_per_epoch - - def __len__(self): - if self._batches_per_epoch is not None: - return self._batches_per_epoch - return super().__len__() - - def __iter__(self): - if self._batches_per_epoch is None: - # NOTE: since torch's DataLoader is listed as the first super class of this class, - # super().__iter__() will resolve to the __iter__ method from torch's DataLoader, - # which is what we want. 
- yield from super().__iter__() - else: - for i in range(self._batches_per_epoch): - try: - yield next(self._data_generator) - except StopIteration: # data_generator is exhausted - self._data_generator = super().__iter__() # so refresh it - yield next(self._data_generator) # and yield required instance - - @classmethod - def from_partial_objects( - cls, - dataset: data.Dataset, - batch_size: int = 1, - shuffle: bool = False, - sampler: Lazy[Sampler] = None, - batch_sampler: Lazy[BatchSampler] = None, - num_workers: int = 0, - pin_memory: bool = False, - drop_last: bool = False, - timeout: int = 0, - worker_init_fn=None, - multiprocessing_context: str = None, - batches_per_epoch: int = None, - ) -> "PyTorchDataLoader": - batch_sampler_ = ( - None if batch_sampler is None else batch_sampler.construct(data_source=dataset) - ) - sampler_ = None if sampler is None else sampler.construct(data_source=dataset) - - return cls( - dataset=dataset, - batch_size=batch_size, - shuffle=shuffle, - sampler=sampler_, - batch_sampler=batch_sampler_, - num_workers=num_workers, - # NOTE: The default for collate_fn is different from the normal `None`. - # We assume that if you are using this class you are using an - # allennlp dataset of instances, which would require this. - collate_fn=allennlp_collate, - pin_memory=pin_memory, - drop_last=drop_last, - timeout=timeout, - worker_init_fn=worker_init_fn, - multiprocessing_context=multiprocessing_context, - batches_per_epoch=batches_per_epoch, - ) diff --git a/allennlp/data/dataset_readers/__init__.py b/allennlp/data/dataset_readers/__init__.py index 716e41439ee..274d9d7e4ee 100644 --- a/allennlp/data/dataset_readers/__init__.py +++ b/allennlp/data/dataset_readers/__init__.py @@ -7,14 +7,15 @@ """ -from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader from allennlp.data.dataset_readers.dataset_reader import ( DatasetReader, - AllennlpDataset, - AllennlpLazyDataset, + WorkerInfo, + DatasetReaderInput, ) +from allennlp.data.dataset_readers.babi import BabiReader +from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader from allennlp.data.dataset_readers.interleaving_dataset_reader import InterleavingDatasetReader +from allennlp.data.dataset_readers.multitask import MultiTaskDatasetReader from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader from allennlp.data.dataset_readers.sharded_dataset_reader import ShardedDatasetReader -from allennlp.data.dataset_readers.babi import BabiReader from allennlp.data.dataset_readers.text_classification_json import TextClassificationJsonReader diff --git a/allennlp/data/dataset_readers/babi.py b/allennlp/data/dataset_readers/babi.py index 31e90379a2b..5ee6cdd3e25 100644 --- a/allennlp/data/dataset_readers/babi.py +++ b/allennlp/data/dataset_readers/babi.py @@ -4,7 +4,7 @@ from overrides import overrides from allennlp.common.file_utils import cached_path -from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.dataset_readers.dataset_reader import DatasetReader, PathOrStr from allennlp.data.instance import Instance from allennlp.data.fields import Field, TextField, ListField, IndexField from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer @@ -45,7 +45,7 @@ def __init__( self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} @overrides - def _read(self, file_path: str): + def _read(self, file_path: PathOrStr): # if `file_path` is a URL, redirect to the cache file_path = 
cached_path(file_path) @@ -85,22 +85,29 @@ def text_to_instance( if self._keep_sentences: context_field_ks = ListField( - [ - TextField([Token(word) for word in line], self._token_indexers) - for line in context - ] + [TextField([Token(word) for word in line]) for line in context] ) fields["supports"] = ListField( [IndexField(support, context_field_ks) for support in supports] ) else: - context_field = TextField( - [Token(word) for line in context for word in line], self._token_indexers - ) + context_field = TextField([Token(word) for line in context for word in line]) fields["context"] = context_field_ks if self._keep_sentences else context_field - fields["question"] = TextField([Token(word) for word in question], self._token_indexers) - fields["answer"] = TextField([Token(answer)], self._token_indexers) + fields["question"] = TextField( + [Token(word) for word in question], + ) + fields["answer"] = TextField([Token(answer)]) return Instance(fields) + + @overrides + def apply_token_indexers(self, instance: Instance) -> None: + if self._keep_sentences: + for text_field in instance.fields["context"]: # type: ignore + text_field._token_indexers = self._token_indexers # type: ignore + else: + instance.fields["context"]._token_indexers = self._token_indexers # type: ignore + instance.fields["question"]._token_indexers = self._token_indexers # type: ignore + instance.fields["answer"]._token_indexers = self._token_indexers # type: ignore diff --git a/allennlp/data/dataset_readers/conll2003.py b/allennlp/data/dataset_readers/conll2003.py index 42f62e1b24d..9f0fe289d85 100644 --- a/allennlp/data/dataset_readers/conll2003.py +++ b/allennlp/data/dataset_readers/conll2003.py @@ -6,7 +6,7 @@ from allennlp.common.checks import ConfigurationError from allennlp.common.file_utils import cached_path -from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.dataset_readers.dataset_reader import DatasetReader, PathOrStr from allennlp.data.dataset_readers.dataset_utils import to_bioul from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField from allennlp.data.instance import Instance @@ -88,7 +88,9 @@ def __init__( label_namespace: str = "labels", **kwargs, ) -> None: - super().__init__(**kwargs) + super().__init__( + manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + ) self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} if tag_label is not None and tag_label not in self._VALID_LABELS: raise ConfigurationError("unknown tag label type: {}".format(tag_label)) @@ -105,26 +107,30 @@ def __init__( self._original_coding_scheme = "IOB1" @overrides - def _read(self, file_path: str) -> Iterable[Instance]: + def _read(self, file_path: PathOrStr) -> Iterable[Instance]: # if `file_path` is a URL, redirect to the cache file_path = cached_path(file_path) with open(file_path, "r") as data_file: logger.info("Reading instances from lines in file at: %s", file_path) - # Group into alternative divider / sentence chunks. - for is_divider, lines in itertools.groupby(data_file, _is_divider): + # Group lines into sentence chunks based on the divider. + line_chunks = ( + lines + for is_divider, lines in itertools.groupby(data_file, _is_divider) # Ignore the divider chunks, so that `lines` corresponds to the words # of a single sentence. 
- if not is_divider: - fields = [line.strip().split() for line in lines] - # unzipping trick returns tuples, but our Fields need lists - fields = [list(field) for field in zip(*fields)] - tokens_, pos_tags, chunk_tags, ner_tags = fields - # TextField requires `Token` objects - tokens = [Token(token) for token in tokens_] + if not is_divider + ) + for lines in self.shard_iterable(line_chunks): + fields = [line.strip().split() for line in lines] + # unzipping trick returns tuples, but our Fields need lists + fields = [list(field) for field in zip(*fields)] + tokens_, pos_tags, chunk_tags, ner_tags = fields + # TextField requires `Token` objects + tokens = [Token(token) for token in tokens_] - yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags) + yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags) def text_to_instance( # type: ignore self, @@ -137,7 +143,7 @@ def text_to_instance( # type: ignore We take `pre-tokenized` input here, because we don't have a tokenizer in this class. """ - sequence = TextField(tokens, self._token_indexers) + sequence = TextField(tokens) instance_fields: Dict[str, Field] = {"tokens": sequence} instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]}) @@ -192,3 +198,7 @@ def text_to_instance( # type: ignore ) return Instance(instance_fields) + + @overrides + def apply_token_indexers(self, instance: Instance) -> None: + instance.fields["tokens"]._token_indexers = self._token_indexers # type: ignore diff --git a/allennlp/data/dataset_readers/dataset_reader.py b/allennlp/data/dataset_readers/dataset_reader.py index 591b7bf95cc..58614160b81 100644 --- a/allennlp/data/dataset_readers/dataset_reader.py +++ b/allennlp/data/dataset_readers/dataset_reader.py @@ -1,120 +1,84 @@ +from dataclasses import dataclass import itertools -from typing import Iterable, Iterator, Optional, List, Any, Callable, Union +from os import PathLike +from typing import Iterable, Iterator, Optional, Union, TypeVar, Dict, List import logging -import os -from pathlib import Path import warnings -from filelock import FileLock, Timeout -import jsonpickle import torch.distributed as dist -from torch.utils.data import Dataset, IterableDataset, get_worker_info from allennlp.data.instance import Instance -from allennlp.data.vocabulary import Vocabulary -from allennlp.common import Tqdm, util -from allennlp.common.checks import ConfigurationError -from allennlp.common.file_utils import CacheFile +from allennlp.common import util from allennlp.common.registrable import Registrable + logger = logging.getLogger(__name__) -class AllennlpDataset(Dataset): - """ - An `AllennlpDataset` is created by calling `.read()` on a non-lazy `DatasetReader`. - It's essentially just a thin wrapper around a list of instances. +@dataclass +class WorkerInfo: """ + Contains information about the worker context when a `DatasetReader` + is being used within a multi-process `DataLoader`. - def __init__(self, instances: List[Instance], vocab: Vocabulary = None): - self.instances = instances - self.vocab = vocab - - def __getitem__(self, idx) -> Instance: - if self.vocab is not None: - self.instances[idx].index_fields(self.vocab) - return self.instances[idx] + From a `DatasetReader` this can accessed with the [`get_worker_info()`](#get_worker_info) method. + """ - def __len__(self): - return len(self.instances) + num_workers: int + """ + The total number of workers. 
+ """ - def __iter__(self) -> Iterator[Instance]: - """ - Even though it's not necessary to implement this because Python can infer - this method from `__len__` and `__getitem__`, this helps with type-checking - since `AllennlpDataset` can be considered an `Iterable[Instance]`. - """ - yield from self.instances + id: int + """ + The 0-indexed ID of the current worker. + """ - def index_with(self, vocab: Vocabulary): - self.vocab = vocab +@dataclass +class DistributedInfo: + """ + Contains information about the global process rank and total world size when the reader is being + used within distributed training. -class AllennlpLazyDataset(IterableDataset): + From a `DatasetReader` this can be accessed with the [`get_distributed_info()`](#get_distributed_info) method. """ - An `AllennlpLazyDataset` is created by calling `.read()` on a lazy `DatasetReader`. - # Parameters + world_size: int + """ + The total number of processes in the distributed group. + """ - instance_generator : `Callable[[str], Iterable[Instance]]` - A factory function that creates an iterable of `Instance`s from a file path. - This is usually just `DatasetReader._instance_iterator`. - file_path : `str` - The path to pass to the `instance_generator` function. - vocab : `Vocab`, optional (default = `None`) - An optional vocab. This can also be set later with the `.index_with` method. + global_rank: int + """ + The 0-indexed ID of the current process within the distributed group. + This will be between 0 and `world_size - 1`, inclusive. """ - def __init__( - self, - instance_generator: Callable[[str], Iterable[Instance]], - file_path: str, - vocab: Vocabulary = None, - ) -> None: - super().__init__() - self._instance_generator = instance_generator - self._file_path = file_path - self.vocab = vocab - - def __iter__(self) -> Iterator[Instance]: - for instance in self._instance_generator(self._file_path): - if self.vocab is not None: - instance.index_fields(self.vocab) - yield instance - def index_with(self, vocab: Vocabulary): - self.vocab = vocab +_T = TypeVar("_T") + +PathOrStr = Union[PathLike, str] +DatasetReaderInput = Union[PathOrStr, List[PathOrStr], Dict[str, PathOrStr]] class DatasetReader(Registrable): """ A `DatasetReader` knows how to turn a file containing a dataset into a collection - of `Instances`. To implement your own, just override the `_read(file_path)` method - to return an `Iterable` of the instances. This could be a list containing the instances - or a lazy generator that returns them one at a time. + of `Instance`s. To implement your own, just override the [`_read(file_path)`](#_read) method + to return an `Iterable` of the instances. Ideally this should be a lazy generator + that yields them one at a time. All parameters necessary to `_read` the data apart from the filepath should be passed to the constructor of the `DatasetReader`. - # Parameters - - lazy : `bool`, optional (default=`False`) - If this is true, `instances()` will return an object whose `__iter__` method - reloads the dataset each time it's called. Otherwise, `instances()` returns a list. + You should also implement [`text_to_instance(*inputs)`](#text_to_instance), + which should be used to turn raw data into `Instance`s. This method is required + in order to use a `Predictor` with your reader. 
- cache_directory : `str`, optional (default=`None`) - If given, we will use this directory to store a cache of already-processed `Instances` in - every file passed to :func:`read`, serialized (by default, though you can override this) as - one string-formatted `Instance` per line. If the cache file for a given `file_path` exists, - we read the `Instances` from the cache instead of re-processing the data (using - :func:`_instances_from_cache_file`). If the cache file does _not_ exist, we will _create_ - it on our first pass through the data (using :func:`_instances_to_cache_file`). + Usually the `_read()` method is implemented to call `text_to_instance()`. - !!! NOTE - It is the _caller's_ responsibility to make sure that this directory is - unique for any combination of code and parameters that you use. That is, if you pass a - directory here, we will use any existing cache files in that directory _regardless of the - parameters you set for this DatasetReader!_ + # Parameters max_instances : `int`, optional (default=`None`) If given, will stop reading after this many instances. This is a useful setting for debugging. @@ -122,175 +86,130 @@ class DatasetReader(Registrable): manual_distributed_sharding: `bool`, optional (default=`False`) By default, when used in a distributed setting, `DatasetReader` makes sure that each - worker process only receives a subset of the data. It does this by reading the whole - dataset in each worker, but filtering out the instances that are not needed. If you - can implement a faster mechanism that only reads part of the data, set this to True, - and do the sharding yourself. + trainer process only receives a subset of the data. It does this by reading the whole + dataset in each worker, but filtering out the instances that are not needed. + + While this ensures that each worker will recieve unique instances, it's not a very efficient + way to do so since each worker still needs to process every single instance. - manual_multi_process_sharding : `bool`, optional (default=`False`) + A better way to handle this is to manually handle the filtering within your `_read()` + method, in which case you should set `manual_distributed_sharding` to `True` so that + the base class knows that you handling the filtering. + + See the section below about how to do this. + + manual_multiprocess_sharding : `bool`, optional (default=`False`) This is similar to the `manual_distributed_sharding` parameter, but applies to multi-process data loading. By default, when this reader is used by a multi-process data loader (i.e. a `DataLoader` with `num_workers > 1`), each worker will filter out all but a subset of the instances that are needed so that you don't end up with duplicates. - !!! NOTE - **There is really no benefit of using a multi-process - `DataLoader` unless you can specifically implement a faster sharding mechanism - within `_read()`**. In that case you should set `manual_multi_process_sharding` - to `True`. + However, there is really no benefit to using multiple workers in your `DataLoader` + unless you implement the sharding within your `_read()` method, in which + case you should set `manual_multiprocess_sharding` to `True`, just as with + `manual_distributed_sharding`. + + See the section below about how to do this. serialization_dir: `str`, optional (default=`None`) The directory in which the training output is saved to, or the directory the model is loaded from. - """ + !!! Note + This is typically not given an entry in a configuration file. 
It will be set automatically + when using the built-in `allennp` commands. + + # Using your reader with multi-process or distributed data loading + + There are two things you may need to update in your `DatasetReader` in order for + it to be efficient in the multi-process or distributed data loading context. + + 1. The `_read()` method should handle filtering out all but the instances that + each particular worker should generate. + + This is important because the default mechanism for filtering out `Instance`s in + the distributed or multi-process `DataLoader` setting is not very efficient, since every + worker would still need to process every single `Instance` in your dataset. + + But by manually handling the filtering / sharding within your `_read()` method, each + worker only needs to perform a subset of the work required to create instances. + + For example, if you were training using 2 GPUs and your `_read()` method reads a file + line-by-line, creating one `Instance` for each line, you could just check the node + rank within `_read()` and then throw away every other line starting at the line number + corresponding to the node rank. + + The helper method [`shard_iterable()`](#shard_iterable) is there to make this easy for you. + You can wrap this around any iterable object in your `_read()` method, and it will + return an iterator that skips the right items based on the distributed training + or multi-process loading context. This method can always be called regardless + of whether or not you're actually using distributed training or multi-process loading. + + Remember though that when you handle the sharding manually within `_read()`, you need + to let the `DatasetReader` know about this so that it doesn't do any additional + filtering. Therefore you need to ensure that both `self.manual_distributed_sharding` and + `self.manual_multiprocess_sharding` are set to `True`. + + If you call the helper method `shard_iterable()` without setting these to `True`, + you'll get an exception. + + 2. If the instances generated by `_read()` contain `TextField`s, those `TextField`s + should not have any token indexers assigned. The token indexers need to be applied + in the [`apply_token_indexers()`](#apply_token_indexers) method instead. + + This is highly recommended because if the instances generated by your `_read()` method + have token indexers attached, those indexers will be duplicated when they are sent across + processes. If your token indexers contain large objects (such as `PretrainedTransformerTokenIndexer`s) + this could take up a massive amount of memory. - CACHE_FILE_LOCK_TIMEOUT: int = 10 - """ - The number of seconds to wait for the lock on a cache file to become available. """ def __init__( self, - lazy: bool = False, - cache_directory: Optional[str] = None, max_instances: Optional[int] = None, manual_distributed_sharding: bool = False, - manual_multi_process_sharding: bool = False, + manual_multiprocess_sharding: bool = False, serialization_dir: Optional[str] = None, ) -> None: - self.lazy = lazy + # Do some validation. 
+ if max_instances is not None and max_instances < 0: + raise ValueError("If specified, max_instances should be a positive int") + self.max_instances = max_instances - self._cache_directory: Optional[Path] = None - if cache_directory: - self._cache_directory = Path(cache_directory) - os.makedirs(self._cache_directory, exist_ok=True) self.manual_distributed_sharding = manual_distributed_sharding - self.manual_multi_process_sharding = manual_multi_process_sharding + self.manual_multiprocess_sharding = manual_multiprocess_sharding self.serialization_dir = serialization_dir + self._worker_info: Optional[WorkerInfo] = None + self._distributed_info: Optional[DistributedInfo] = None + # If we're actually in the main process, we can find the info using torch utils. + if util.is_distributed(): + self._distributed_info = DistributedInfo(dist.get_world_size(), dist.get_rank()) - def read(self, file_path: Union[Path, str]) -> Union[AllennlpDataset, AllennlpLazyDataset]: + def read(self, file_path: DatasetReaderInput) -> Iterator[Instance]: """ - Returns an dataset containing all the instances that can be read from the file path. - - If `self.lazy` is `False`, this eagerly reads all instances from `self._read()` - and returns an `AllennlpDataset`. - - If `self.lazy` is `True`, this returns an `AllennlpLazyDataset`, which internally - relies on the generator created from `self._read()` to lazily produce `Instance`s. - In this case your implementation of `_read()` must also be lazy - (that is, not load all instances into memory at once), otherwise - you will get a `ConfigurationError`. - - In either case, the returned `Iterable` can be iterated - over multiple times. It's unlikely you want to override this function, - but if you do your result should likewise be repeatedly iterable. + Returns an iterator of instances that can be read from the file path. """ - if not isinstance(file_path, str): - file_path = str(file_path) - - lazy = getattr(self, "lazy", None) - - if lazy is None: - warnings.warn( - "DatasetReader.lazy is not set, " - "did you forget to call the superclass constructor?", - UserWarning, - ) - - if lazy: - return AllennlpLazyDataset(self._instance_iterator, file_path) - else: - cache_file: Optional[str] = None - if self._cache_directory: - cache_file = self._get_cache_location_for_file_path(file_path) - - if cache_file is not None and os.path.exists(cache_file): - try: - # Try to acquire a lock just to make sure another process isn't in the middle - # of writing to the cache. - cache_file_lock = FileLock( - cache_file + ".lock", timeout=self.CACHE_FILE_LOCK_TIMEOUT - ) - cache_file_lock.acquire() - # We make an assumption here that if we can obtain the lock, no one will - # be trying to write to the file anymore, so it should be safe to release the lock - # before reading so that other processes can also read from it. - cache_file_lock.release() - logger.info("Reading instances from cache %s", cache_file) - instances = self._instances_from_cache_file(cache_file) - except Timeout: - logger.warning( - "Failed to acquire lock on dataset cache file within %d seconds. " - "Cannot use cache to read instances.", - self.CACHE_FILE_LOCK_TIMEOUT, - ) - instances = self._multi_worker_islice(self._read(file_path)) - else: - instances = self._multi_worker_islice(self._read(file_path)) - - # Then some validation. - if not isinstance(instances, list): - instances = list(instances) - - if not instances: - raise ConfigurationError( - "No instances were read from the given filepath {}. 
" - "Is the path correct?".format(file_path) - ) + for instance in self._multi_worker_islice(self._read(file_path)): # type: ignore + if self._worker_info is None: + # If not running in a subprocess, it's safe to apply the token_indexers right away. + self.apply_token_indexers(instance) + yield instance - # And finally we try writing to the cache. - if cache_file is not None and not os.path.exists(cache_file): - if self.max_instances is not None: - # But we don't write to the cache when max_instances is specified. - logger.warning( - "Skipping writing to data cache since max_instances was specified." - ) - elif util.is_distributed() or (get_worker_info() and get_worker_info().num_workers): - # We also shouldn't write to the cache if there's more than one process loading - # instances since each worker only receives a partial share of the instances. - logger.warning( - "Can't cache data instances when there are multiple processes loading data" - ) - else: - try: - with FileLock(cache_file + ".lock", timeout=self.CACHE_FILE_LOCK_TIMEOUT): - self._instances_to_cache_file(cache_file, instances) - except Timeout: - logger.warning( - "Failed to acquire lock on dataset cache file within %d seconds. " - "Cannot write to cache.", - self.CACHE_FILE_LOCK_TIMEOUT, - ) - - return AllennlpDataset(instances) - - def _get_cache_location_for_file_path(self, file_path: str) -> str: - assert self._cache_directory is not None - return str(self._cache_directory / util.flatten_filename(str(file_path))) - - def _read(self, file_path: str) -> Iterable[Instance]: + def _read(self, file_path) -> Iterable[Instance]: """ - Reads the instances from the given file_path and returns them as an - `Iterable` (which could be a list or could be a generator). - You are strongly encouraged to use a generator, so that users can + Reads the instances from the given `file_path` and returns them as an + `Iterable`. + + You are strongly encouraged to use a generator so that users can read a dataset in a lazy way, if they so choose. """ + # NOTE: `file_path` is left untyped here on purpose. + # Technically the type should be `DatasetReaderInput`, but many subclass + # implementations of `DatasetReader` define their `_read()` method to take a more + # specific type, such as just `str`. But that would be a type error + # according to mypy: https://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides raise NotImplementedError - def _instances_from_cache_file(self, cache_filename: str) -> Iterable[Instance]: - with open(cache_filename, "r") as cache_file: - yield from self._multi_worker_islice(cache_file, self.deserialize_instance) - - def _instances_to_cache_file(self, cache_filename, instances) -> None: - # We serialize to a temp file first in case anything goes wrong while - # writing to cache (e.g., the computer shuts down unexpectedly). - # Then we just copy the file over to `cache_filename`. 
- with CacheFile(cache_filename, mode="w+") as cache_handle: - logger.info("Caching instances to temp file %s", cache_handle.name) - for instance in Tqdm.tqdm(instances, desc="caching instances"): - cache_handle.write(self.serialize_instance(instance) + "\n") - def text_to_instance(self, *inputs) -> Instance: """ Does whatever tokenization or processing is necessary to go from textual input to an @@ -310,143 +229,149 @@ def text_to_instance(self, *inputs) -> Instance: """ raise NotImplementedError - def serialize_instance(self, instance: Instance) -> str: + def apply_token_indexers(self, instance: Instance) -> None: """ - Serializes an `Instance` to a string. We use this for caching the processed data. - - The default implementation is to use `jsonpickle`. If you would like some other format - for your pre-processed data, override this method. + If `Instance`s created by this reader contain `TextField`s without `token_indexers`, + this method can be overriden to set the `token_indexers` of those fields. """ - return jsonpickle.dumps(instance) + pass - def deserialize_instance(self, string: str) -> Instance: + def get_worker_info(self) -> Optional[WorkerInfo]: """ - Deserializes an `Instance` from a string. We use this when reading processed data from a - cache. + Provides a [`WorkerInfo`](#WorkerInfo) object when the reader is being used within a + worker of a multi-process `DataLoader`. + + If the reader is in the main process, this is just `None`. + + !!! NOTE + This is different than distributed training. If the `DatasetReader` + is being used within distributed training, `get_worker_info()` will only + provide information on the `DataLoader` worker within its node. + + Use [`get_distributed_info`](#get_distributed_info) to get information on distributed + training context. - The default implementation is to use `jsonpickle`. If you would like some other format - for your pre-processed data, override this method. """ - return jsonpickle.loads(string.strip()) # type: ignore + return self._worker_info - def _multi_worker_islice( - self, - iterable: Iterable[Any], - transform: Optional[Callable[[Any], Instance]] = None, - ensure_lazy: bool = False, - ) -> Iterable[Instance]: + def get_distributed_info(self) -> Optional[DistributedInfo]: """ - Helper method that determines which raw instances to skip based on the current - node rank (for distributed training) and worker ID (for multi-process data loading). + Provides a [`DistributedInfo`](#DistributedInfo) object when the reader is being + used within distributed training. - # Parameters + If not in distributed training, this is just `None`. + """ + return self._distributed_info - iterable : `Iterable[Any]` - An iterable that yields raw data that can be transformed into `Instance`s - through the `transform` function. - transform : `Optional[Callable[[Any], Instance]]`, optional (default = `None`) - An optional function that will be applied to the raw data generated - by `iterable` to create `Instance`s. This is used, e.g., when reading - cached data. - ensure_lazy : `bool`, optional (default = `False`) - If `True`, a `ConfigurationError` error will be raised if `iterable` - is a list instead of a lazy generator type. + def _set_worker_info(self, info: Optional[WorkerInfo]) -> None: + """ + Should only be used internally. + """ + self._worker_info = info - # Returns + def _set_distributed_info(self, info: Optional[DistributedInfo]) -> None: + """ + Should only be used internally. 
+ """ + self._distributed_info = info - `Iterable[Instance]` + def shard_iterable(self, iterable: Iterable[_T]) -> Iterator[_T]: + """ + Helper method that determines which items in an iterable object to skip based + on the current node rank (for distributed training) and worker ID (for multi-process data loading). """ - if ensure_lazy and isinstance(iterable, (list, tuple)): - raise ConfigurationError("For a lazy dataset reader, _read() must return a generator") - - wrap_with_tqdm = True - start_index = 0 - step_size = 1 - if not self.manual_distributed_sharding and util.is_distributed(): - start_index = dist.get_rank() - step_size = dist.get_world_size() - worker_info = None if self.manual_multi_process_sharding else get_worker_info() - if worker_info: - warnings.warn( - "Using multi-process data loading without setting " - "DatasetReader.manual_multi_process_sharding to True.\n" - "Did you forget to set this?\n" - "If you're not handling the multi-process sharding logic within your " - "_read() method, there is probably no benefit to using more than one " - "worker.", - UserWarning, + if not self.manual_distributed_sharding or not self.manual_multiprocess_sharding: + raise ValueError( + "self.shard_iterable() was called but self.manual_distributed_sharding and " + "self.manual_multiprocess_sharding was not set to True. Did you forget to call " + "super().__init__(manual_distributed_sharding=True, manual_multiprocess_sharding=True) " + "in your constructor?" + ) + + sharded_slice: Iterator[_T] = iter(iterable) + + if util.is_distributed(): + sharded_slice = itertools.islice( + sharded_slice, dist.get_rank(), None, dist.get_world_size() ) - # Scale `start_index` by `num_workers`, then shift by worker `id`. - start_index *= worker_info.num_workers - start_index += worker_info.id - # Scale `step_size` by `num_workers`. - step_size *= worker_info.num_workers - if worker_info.id > 0: - # We only want to log with tqdm from the main loader process. - wrap_with_tqdm = False - - islice = itertools.islice(iterable, start_index, self.max_instances, step_size) - if wrap_with_tqdm: - islice = Tqdm.tqdm(islice, desc="reading instances") - - if transform is not None: - return (transform(x) for x in islice) - return islice - - def _instance_iterator(self, file_path: str) -> Iterable[Instance]: - cache_file: Optional[str] = None - if self._cache_directory: - cache_file = self._get_cache_location_for_file_path(file_path) - - if cache_file is not None and os.path.exists(cache_file): - cache_file_lock = FileLock(cache_file + ".lock", timeout=self.CACHE_FILE_LOCK_TIMEOUT) - try: - cache_file_lock.acquire() - # We make an assumption here that if we can obtain the lock, no one will - # be trying to write to the file anymore, so it should be safe to release the lock - # before reading so that other processes can also read from it. - cache_file_lock.release() - logger.info("Reading instances from cache %s", cache_file) - with open(cache_file) as data_file: - yield from self._multi_worker_islice( - data_file, transform=self.deserialize_instance - ) - except Timeout: - logger.warning( - "Failed to acquire lock on dataset cache file within %d seconds. " - "Cannot use cache to read instances.", - self.CACHE_FILE_LOCK_TIMEOUT, + + if self._worker_info is not None: + sharded_slice = itertools.islice( + sharded_slice, self._worker_info.id, None, self._worker_info.num_workers + ) + + # We don't know for sure how many instances we have to produce. + # _multi_worker_islice() figures that out. 
But we know for sure + # it won't be more than max_instances. + if self.max_instances is not None: + sharded_slice = itertools.islice(sharded_slice, self.max_instances) + + return sharded_slice + + def _multi_worker_islice( + self, + iterable: Iterable[_T], + ) -> Iterator[_T]: + """ + This is just like `shard_iterable` but is for internal use only. + + It has some additional logic to handle `max_instances` based on the distributed + or multi-process context, and whether or not sharding is handled manually + in the `_read()` method. + """ + # This has some complicated logic because any given reader may or may not + # implement manual multi-process and manual distributed sharding itself. + # We have to handle all possibilities. + + sharded_slice: Iterator[_T] = iter(iterable) + + # We'll adjust max_instances as we go, depending on what sort of sharding is done. + # At the end, we want to ensure the total number of instances collected across + # all workers processes is equal to self.max_instances. + max_instances = self.max_instances + + if self._distributed_info is not None: + if max_instances is not None: + # Need to scale down max_instances because otherwise each node would read self.max_instances, + # but we really want self.max_instances total across all nodes. + if self._distributed_info.global_rank < ( + max_instances % self._distributed_info.world_size + ): + max_instances = max_instances // self._distributed_info.world_size + 1 + else: + max_instances = max_instances // self._distributed_info.world_size + + if not self.manual_distributed_sharding: + sharded_slice = itertools.islice( + sharded_slice, + self._distributed_info.global_rank, + None, + self._distributed_info.world_size, ) - yield from self._multi_worker_islice(self._read(file_path), ensure_lazy=True) - elif cache_file is not None and not os.path.exists(cache_file): - instances = self._multi_worker_islice(self._read(file_path), ensure_lazy=True) - # The cache file doesn't exist so we'll try writing to it. - if self.max_instances is not None: - # But we don't write to the cache when max_instances is specified. - logger.warning("Skipping writing to data cache since max_instances was specified.") - yield from instances - elif util.is_distributed() or (get_worker_info() and get_worker_info().num_workers): - # We also shouldn't write to the cache if there's more than one process loading - # instances since each worker only receives a partial share of the instances. - logger.warning( - "Can't cache data instances when there are multiple processes loading data" + + if self._worker_info is not None: + if max_instances is not None: + # Like in the distributed case above, we need to adjust max_instances. 
+ if self._worker_info.id < (max_instances % self._worker_info.num_workers): + max_instances = max_instances // self._worker_info.num_workers + 1 + else: + max_instances = max_instances // self._worker_info.num_workers + + if not self.manual_multiprocess_sharding: + warnings.warn( + "Using multi-process data loading without setting " + "DatasetReader.manual_multiprocess_sharding to True.\n" + "Did you forget to set this?\n" + "If you're not handling the multi-process sharding logic within your " + "_read() method, there is probably no benefit to using more than one " + "worker.", + UserWarning, + ) + sharded_slice = itertools.islice( + sharded_slice, self._worker_info.id, None, self._worker_info.num_workers ) - yield from instances - else: - try: - with FileLock(cache_file + ".lock", timeout=self.CACHE_FILE_LOCK_TIMEOUT): - with CacheFile(cache_file, mode="w+") as cache_handle: - logger.info("Caching instances to temp file %s", cache_handle.name) - for instance in instances: - cache_handle.write(self.serialize_instance(instance) + "\n") - yield instance - except Timeout: - logger.warning( - "Failed to acquire lock on dataset cache file within %d seconds. " - "Cannot write to cache.", - self.CACHE_FILE_LOCK_TIMEOUT, - ) - yield from instances - else: - # No cache. - yield from self._multi_worker_islice(self._read(file_path), ensure_lazy=True) + + if max_instances is not None: + sharded_slice = itertools.islice(sharded_slice, max_instances) + + return sharded_slice diff --git a/allennlp/data/dataset_readers/interleaving_dataset_reader.py b/allennlp/data/dataset_readers/interleaving_dataset_reader.py index 0be1ab84a17..0f592aa5c3f 100644 --- a/allennlp/data/dataset_readers/interleaving_dataset_reader.py +++ b/allennlp/data/dataset_readers/interleaving_dataset_reader.py @@ -1,8 +1,8 @@ -from typing import Dict, Mapping, Iterable +from typing import Dict, Mapping, Iterable, Union import json from allennlp.common.checks import ConfigurationError -from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.dataset_readers.dataset_reader import DatasetReader, PathOrStr from allennlp.data.fields import MetadataField from allennlp.data.instance import Instance @@ -72,14 +72,17 @@ def _read_all_at_once(self, datasets: Mapping[str, Iterable[Instance]]) -> Itera instance.fields[self._dataset_field_name] = MetadataField(key) yield instance - def _read(self, file_path: str) -> Iterable[Instance]: - try: - file_paths = json.loads(file_path) - except json.JSONDecodeError: - raise ConfigurationError( - "the file_path for the InterleavingDatasetReader " - "needs to be a JSON-serialized dictionary {reader_name -> file_path}" - ) + def _read(self, file_path: Union[str, Dict[str, PathOrStr]]) -> Iterable[Instance]: + if isinstance(file_path, str): + try: + file_paths = json.loads(file_path) + except json.JSONDecodeError: + raise ConfigurationError( + "the file_path for the InterleavingDatasetReader " + "needs to be a JSON-serialized dictionary {reader_name -> file_path}" + ) + else: + file_paths = file_path if file_paths.keys() != self._readers.keys(): raise ConfigurationError("mismatched keys") diff --git a/allennlp/data/dataset_readers/multitask.py b/allennlp/data/dataset_readers/multitask.py new file mode 100644 index 00000000000..be529b884e4 --- /dev/null +++ b/allennlp/data/dataset_readers/multitask.py @@ -0,0 +1,31 @@ +from os import PathLike +from typing import Dict, Iterator, Union + +from allennlp.data.instance import Instance +from 
allennlp.data.dataset_readers.dataset_reader import DatasetReader + + +@DatasetReader.register("multitask") +class MultiTaskDatasetReader(DatasetReader): + """ + This `DatasetReader` simply collects a dictionary of other `DatasetReaders`. It is designed for + a different class (the `MultiTaskDataLoader`) to actually read from each of the underlying + dataset readers, and so this really is just a glorified dictionary that we can construct as a + `DatasetReader`. We throw an error if you try to actually call `read()`, because you should be + doing that differently. + + Registered as a `DatasetReader` with name "multitask". + + # Parameters + + readers : `Dict[str, DatasetReader]` + A mapping from dataset name to `DatasetReader` objects for reading that dataset. You can + use whatever names you want for the datasets, but they have to match the keys you use for + data files and in other places in the `MultiTaskDataLoader` and `MultiTaskScheduler`. + """ + + def __init__(self, readers: Dict[str, DatasetReader]) -> None: + self.readers = readers + + def read(self, file_paths: Dict[str, Union[PathLike, str]]) -> Dict[str, Iterator[Instance]]: # type: ignore + raise RuntimeError("This class is not designed to be called like this") diff --git a/allennlp/data/dataset_readers/sequence_tagging.py b/allennlp/data/dataset_readers/sequence_tagging.py index 63b3442d1a6..40f03a5d6de 100644 --- a/allennlp/data/dataset_readers/sequence_tagging.py +++ b/allennlp/data/dataset_readers/sequence_tagging.py @@ -49,7 +49,9 @@ def __init__( token_indexers: Dict[str, TokenIndexer] = None, **kwargs, ) -> None: - super().__init__(**kwargs) + super().__init__( + manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + ) self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} self._word_tag_delimiter = word_tag_delimiter self._token_delimiter = token_delimiter @@ -60,9 +62,8 @@ def _read(self, file_path): file_path = cached_path(file_path) with open(file_path, "r") as data_file: - logger.info("Reading instances from lines in file at: %s", file_path) - for line in data_file: + for line in self.shard_iterable(data_file): line = line.strip("\n") # skip blank lines @@ -85,9 +86,13 @@ def text_to_instance( # type: ignore """ fields: Dict[str, Field] = {} - sequence = TextField(tokens, self._token_indexers) + sequence = TextField(tokens) fields["tokens"] = sequence fields["metadata"] = MetadataField({"words": [x.text for x in tokens]}) if tags is not None: fields["tags"] = SequenceLabelField(tags, sequence) return Instance(fields) + + @overrides + def apply_token_indexers(self, instance: Instance) -> None: + instance.fields["tokens"]._token_indexers = self._token_indexers # type: ignore diff --git a/allennlp/data/dataset_readers/sharded_dataset_reader.py b/allennlp/data/dataset_readers/sharded_dataset_reader.py index c4513cdd596..2976bb332eb 100644 --- a/allennlp/data/dataset_readers/sharded_dataset_reader.py +++ b/allennlp/data/dataset_readers/sharded_dataset_reader.py @@ -1,13 +1,11 @@ import glob import logging import os -import torch from typing import Iterable -from allennlp.common import util from allennlp.common.checks import ConfigurationError from allennlp.common.file_utils import cached_path -from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.dataset_readers.dataset_reader import DatasetReader, PathOrStr from allennlp.data.instance import Instance @@ -26,14 +24,12 @@ class ShardedDatasetReader(DatasetReader): files within the archive. 
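> Editor's note: a quick usage sketch for the sharded reader described here, not part of this patch; the glob path below is made up. When running with multiple data-loading workers or in distributed training, each shard file ends up being read by a single worker.

```python
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader, ShardedDatasetReader

# Hypothetical shard layout, for illustration only: one TSV file per shard.
reader = ShardedDatasetReader(base_reader=SequenceTaggingDatasetReader())
for instance in reader.read("/data/tagging_shards/*.tsv"):
    ...  # each shard is routed to exactly one worker / training process
```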
The order the files are processed in is deterministic to enable the - instances to be filtered according to worker rank in the distributed case. + instances to be filtered according to worker rank in the distributed training or multi-process + data loading scenarios. In either case, the number of file shards should ideally be a multiple + of the number of workers, and each file should produce roughly the same number of instances. Registered as a `DatasetReader` with name "sharded". - This class accepts all additional parameters of any `DatasetReader` class via `**kwargs`. - We give priority to the values set in the constructor for the instance of this class. - Optionally, we will automatically inherit attributes from the `base_reader` when required. - # Parameters base_reader : `DatasetReader` @@ -41,33 +37,14 @@ class ShardedDatasetReader(DatasetReader): """ def __init__(self, base_reader: DatasetReader, **kwargs) -> None: - # ShardedDatasetReader is a wrapper for the original base_reader so some of the parameters like 'lazy' - # can be safely inherited. However, ShardedDatasetReader is a class instance of a DatasetReader as well. - # So we give priority to the parameters for the current instance stored in 'kwargs'. - # If not present, we check the ones in the base reader - kwargs["lazy"] = kwargs.get("lazy", base_reader.lazy) - - super().__init__(manual_distributed_sharding=True, **kwargs) - - if util.is_distributed(): - self._rank = torch.distributed.get_rank() - self._world_size = torch.distributed.get_world_size() - else: - self._rank = 0 - self._world_size = 1 - + super().__init__( + manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + ) self.reader = base_reader - # We have to check that the base reader doesn't implement manual distributed - # sharding itself, because if it does, then only a fraction of the instances - # will be read. - if getattr(self.reader, "manual_distributed_sharding", False): - raise ValueError( - "The base reader of a sharded dataset reader should not implement " - "manual distributed sharding itself." - ) - # However we still need to set this flag to `True` after the fact so that - # all of the instances within each shard are used. - self.reader.manual_distributed_sharding = True + # We have to make the base reader think that it's the only worker so that it doesn't + # do any of its own filtering. + self.reader._set_worker_info(None) + self.reader._set_distributed_info(None) def text_to_instance(self, *args, **kwargs) -> Instance: """ @@ -75,7 +52,7 @@ def text_to_instance(self, *args, **kwargs) -> Instance: """ return self.reader.text_to_instance(*args, **kwargs) # type: ignore - def _read(self, file_path: str) -> Iterable[Instance]: + def _read(self, file_path: PathOrStr) -> Iterable[Instance]: try: maybe_extracted_archive = cached_path(file_path, extract_archive=True) if not os.path.isdir(maybe_extracted_archive): @@ -90,15 +67,14 @@ def _read(self, file_path: str) -> Iterable[Instance]: raise ConfigurationError(f"No files found in {file_path}") except FileNotFoundError: # Not a local or remote archive, so treat as a glob. - shards = glob.glob(file_path) + shards = glob.glob(str(file_path)) if not shards: raise ConfigurationError(f"No files found matching {file_path}") # Ensure a consistent order. 
shards.sort() - for i, shard in enumerate(shards): - if i % self._world_size == self._rank: - logger.info(f"reading instances from {shard}") - for instance in self.reader.read(shard): - yield instance + for shard in self.shard_iterable(shards): + logger.info(f"reading instances from {shard}") + for instance in self.reader.read(shard): + yield instance diff --git a/allennlp/data/dataset_readers/text_classification_json.py b/allennlp/data/dataset_readers/text_classification_json.py index ce16e2d0ddf..81d1a80ebfc 100644 --- a/allennlp/data/dataset_readers/text_classification_json.py +++ b/allennlp/data/dataset_readers/text_classification_json.py @@ -55,7 +55,9 @@ def __init__( skip_label_indexing: bool = False, **kwargs, ) -> None: - super().__init__(**kwargs) + super().__init__( + manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + ) self._tokenizer = tokenizer or SpacyTokenizer() self._segment_sentences = segment_sentences self._max_sequence_length = max_sequence_length @@ -67,7 +69,7 @@ def __init__( @overrides def _read(self, file_path): with open(cached_path(file_path), "r") as data_file: - for line in data_file.readlines(): + for line in self.shard_iterable(data_file.readlines()): if not line: continue items = json.loads(line) @@ -83,9 +85,7 @@ def _read(self, file_path): ) else: label = str(label) - instance = self.text_to_instance(text=text, label=label) - if instance is not None: - yield instance + yield self.text_to_instance(text=text, label=label) def _truncate(self, tokens): """ @@ -124,13 +124,21 @@ def text_to_instance( word_tokens = self._tokenizer.tokenize(sentence) if self._max_sequence_length is not None: word_tokens = self._truncate(word_tokens) - sentences.append(TextField(word_tokens, self._token_indexers)) + sentences.append(TextField(word_tokens)) fields["tokens"] = ListField(sentences) else: tokens = self._tokenizer.tokenize(text) if self._max_sequence_length is not None: tokens = self._truncate(tokens) - fields["tokens"] = TextField(tokens, self._token_indexers) + fields["tokens"] = TextField(tokens) if label is not None: fields["label"] = LabelField(label, skip_indexing=self._skip_label_indexing) return Instance(fields) + + @overrides + def apply_token_indexers(self, instance: Instance) -> None: + if self._segment_sentences: + for text_field in instance.fields["tokens"]: # type: ignore + text_field._token_indexers = self._token_indexers + else: + instance.fields["tokens"]._token_indexers = self._token_indexers # type: ignore diff --git a/allennlp/data/fields/__init__.py b/allennlp/data/fields/__init__.py index 6b6a706bed0..fa01eac7367 100644 --- a/allennlp/data/fields/__init__.py +++ b/allennlp/data/fields/__init__.py @@ -5,7 +5,7 @@ from allennlp.data.fields.field import Field from allennlp.data.fields.adjacency_field import AdjacencyField -from allennlp.data.fields.array_field import ArrayField +from allennlp.data.fields.tensor_field import TensorField from allennlp.data.fields.flag_field import FlagField from allennlp.data.fields.index_field import IndexField from allennlp.data.fields.label_field import LabelField @@ -17,3 +17,4 @@ from allennlp.data.fields.sequence_label_field import SequenceLabelField from allennlp.data.fields.span_field import SpanField from allennlp.data.fields.text_field import TextField +from allennlp.data.fields.array_field import ArrayField diff --git a/allennlp/data/fields/array_field.py b/allennlp/data/fields/array_field.py index 4b33ca01b2a..0a69968c359 100644 --- a/allennlp/data/fields/array_field.py +++ 
b/allennlp/data/fields/array_field.py @@ -1,68 +1,4 @@ -from typing import Dict +from allennlp.data.fields.tensor_field import TensorField -import numpy -import torch -from overrides import overrides - -from allennlp.data.fields.field import Field - - -class ArrayField(Field[numpy.ndarray]): - """ - A class representing an array, which could have arbitrary dimensions. - A batch of these arrays are padded to the max dimension length in the batch - for each dimension. - """ - - __slots__ = ["array", "padding_value", "dtype"] - - def __init__( - self, array: numpy.ndarray, padding_value: int = 0, dtype: numpy.dtype = numpy.float32 - ) -> None: - self.array = array - self.padding_value = padding_value - self.dtype = dtype - - @overrides - def get_padding_lengths(self) -> Dict[str, int]: - return {"dimension_" + str(i): shape for i, shape in enumerate(self.array.shape)} - - @overrides - def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor: - max_shape = [padding_lengths["dimension_{}".format(i)] for i in range(len(padding_lengths))] - - # Convert explicitly to an ndarray just in case it's an scalar - # (it'd end up not being an ndarray otherwise). - # Also, the explicit dtype declaration for `asarray` is necessary for scalars. - return_array = numpy.asarray( - numpy.ones(max_shape, dtype=self.dtype) * self.padding_value, dtype=self.dtype - ) - - # If the tensor has a different shape from the largest tensor, pad dimensions with zeros to - # form the right shaped list of slices for insertion into the final tensor. - slicing_shape = list(self.array.shape) - if len(self.array.shape) < len(max_shape): - slicing_shape = slicing_shape + [ - 0 for _ in range(len(max_shape) - len(self.array.shape)) - ] - slices = tuple([slice(0, x) for x in slicing_shape]) - return_array[slices] = self.array - tensor = torch.from_numpy(return_array) - return tensor - - @overrides - def empty_field(self): - # Pass the padding_value, so that any outer field, e.g., `ListField[ArrayField]` uses the - # same padding_value in the padded ArrayFields - return ArrayField( - numpy.array([], dtype=self.dtype), padding_value=self.padding_value, dtype=self.dtype - ) - - def __str__(self) -> str: - return f"ArrayField with shape: {self.array.shape} and dtype: {self.dtype}." - - def __len__(self): - return 1 if self.array.ndim == 0 else self.array.shape[0] - - def __eq__(self, other) -> bool: - return numpy.array_equal(self.array, other.array) +ArrayField = TensorField +"""For backwards compatibility, we keep the name `ArrayField`.""" diff --git a/allennlp/data/fields/list_field.py b/allennlp/data/fields/list_field.py index b1d409b7e20..0a77a75a5d8 100644 --- a/allennlp/data/fields/list_field.py +++ b/allennlp/data/fields/list_field.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Iterator +from typing import Dict, List, Iterator, Sequence from overrides import overrides @@ -26,13 +26,13 @@ class ListField(SequenceField[DataArray]): __slots__ = ["field_list"] - def __init__(self, field_list: List[Field]) -> None: + def __init__(self, field_list: Sequence[Field]) -> None: field_class_set = {field.__class__ for field in field_list} assert ( len(field_class_set) == 1 ), "ListFields must contain a single field type, found " + str(field_class_set) # Not sure why mypy has a hard time with this type... 
- self.field_list: List[Field] = field_list + self.field_list = field_list # Sequence[Field] methods def __iter__(self) -> Iterator[Field]: diff --git a/allennlp/data/fields/tensor_field.py b/allennlp/data/fields/tensor_field.py new file mode 100644 index 00000000000..02091df7a5e --- /dev/null +++ b/allennlp/data/fields/tensor_field.py @@ -0,0 +1,78 @@ +from typing import Dict, Any, Union, Optional + +import torch +import numpy as np +from overrides import overrides + +from allennlp.data.fields.field import Field + + +class TensorField(Field[torch.Tensor]): + """ + A class representing a tensor, which could have arbitrary dimensions. + A batch of these tensors are padded to the max dimension length in the batch + for each dimension. + """ + + __slots__ = ["tensor", "padding_value"] + + def __init__( + self, + tensor: Union[torch.Tensor, np.ndarray], + padding_value: Any = 0.0, + dtype: Optional[Union[np.dtype, torch.dtype]] = None, + ) -> None: + if dtype is not None: + if isinstance(tensor, np.ndarray): + tensor = tensor.astype(dtype) + elif isinstance(tensor, torch.Tensor): + tensor = tensor.to(dtype) + else: + raise ValueError("Did not recognize the type of `tensor`.") + if isinstance(tensor, np.ndarray): + tensor = torch.from_numpy(tensor) + + self.tensor = tensor.cpu() + self.padding_value = padding_value + + @overrides + def get_padding_lengths(self) -> Dict[str, int]: + return {"dimension_" + str(i): shape for i, shape in enumerate(self.tensor.size())} + + @overrides + def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor: + tensor = self.tensor + while len(tensor.size()) < len(padding_lengths): + tensor = tensor.unsqueeze(-1) + pad = [ + padding + for i, dimension_size in reversed(list(enumerate(tensor.size()))) + for padding in [0, padding_lengths["dimension_" + str(i)] - dimension_size] + ] + return torch.nn.functional.pad(tensor, pad, value=self.padding_value) + + @overrides + def empty_field(self): + # Pass the padding_value, so that any outer field, e.g., `ListField[TensorField]` uses the + # same padding_value in the padded ArrayFields + return TensorField( + torch.tensor([], dtype=self.tensor.dtype), padding_value=self.padding_value + ) + + def __str__(self) -> str: + return f"TensorField with shape: {self.tensor.size()} and dtype: {self.tensor.dtype}." 
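> Editor's note: a small sketch of the padding behavior of this new field (values chosen purely for illustration). `as_tensor()` pads every dimension up to the requested length using `padding_value`.

```python
import torch
from allennlp.data.fields import TensorField

field = TensorField(torch.tensor([[1.0, 2.0]]), padding_value=-1)
# Pretend the largest tensor in the batch has shape (2, 3).
padded = field.as_tensor({"dimension_0": 2, "dimension_1": 3})
# padded == tensor([[ 1.,  2., -1.],
#                   [-1., -1., -1.]])
```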
+ + def __len__(self): + return 1 if len(self.tensor.size()) <= 0 else self.tensor.size(0) + + def __eq__(self, other) -> bool: + if isinstance(self, other.__class__): + return ( + torch.equal(self.tensor, other.tensor) and self.padding_value == other.padding_value + ) + return NotImplemented + + @property + def array(self): + """This is a compatibility method that returns the underlying tensor as a numpy array.""" + return self.tensor.numpy() diff --git a/allennlp/data/fields/text_field.py b/allennlp/data/fields/text_field.py index 19f79078f6d..9d171223fd6 100644 --- a/allennlp/data/fields/text_field.py +++ b/allennlp/data/fields/text_field.py @@ -44,7 +44,9 @@ class TextField(SequenceField[TextFieldTensors]): __slots__ = ["tokens", "_token_indexers", "_indexed_tokens"] - def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) -> None: + def __init__( + self, tokens: List[Token], token_indexers: Optional[Dict[str, TokenIndexer]] = None + ) -> None: self.tokens = tokens self._token_indexers = token_indexers self._indexed_tokens: Optional[Dict[str, IndexedTokenList]] = None @@ -55,16 +57,32 @@ def __init__(self, tokens: List[Token], token_indexers: Dict[str, TokenIndexer]) "Found: {} with types {}.".format(tokens, [type(x) for x in tokens]) ) + @property + def token_indexers(self) -> Dict[str, TokenIndexer]: + if self._token_indexers is None: + raise ValueError( + "TextField's token_indexers have not been set.\n" + "Did you forget to call DatasetReader.apply_token_indexers(instance) " + "on your instance?\n" + "If apply_token_indexers() is being called but " + "you're still seeing this error, it may not be implemented correctly." + ) + return self._token_indexers + + @token_indexers.setter + def token_indexers(self, token_indexers: Dict[str, TokenIndexer]) -> None: + self._token_indexers = token_indexers + @overrides def count_vocab_items(self, counter: Dict[str, Dict[str, int]]): - for indexer in self._token_indexers.values(): + for indexer in self.token_indexers.values(): for token in self.tokens: indexer.count_vocab_items(token, counter) @overrides def index(self, vocab: Vocabulary): self._indexed_tokens = {} - for indexer_name, indexer in self._token_indexers.items(): + for indexer_name, indexer in self.token_indexers.items(): self._indexed_tokens[indexer_name] = indexer.tokens_to_indices(self.tokens, vocab) @overrides @@ -80,7 +98,7 @@ def get_padding_lengths(self) -> Dict[str, int]: ) padding_lengths = {} - for indexer_name, indexer in self._token_indexers.items(): + for indexer_name, indexer in self.token_indexers.items(): indexer_lengths = indexer.get_padding_lengths(self._indexed_tokens[indexer_name]) for key, length in indexer_lengths.items(): padding_lengths[f"{indexer_name}___{key}"] = length @@ -106,7 +124,7 @@ def as_tensor(self, padding_lengths: Dict[str, int]) -> TextFieldTensors: indexer_name, padding_key = key.split("___") indexer_lengths[indexer_name][padding_key] = value - for indexer_name, indexer in self._token_indexers.items(): + for indexer_name, indexer in self.token_indexers.items(): tensors[indexer_name] = indexer.as_padded_tensor_dict( self._indexed_tokens[indexer_name], indexer_lengths[indexer_name] ) @@ -116,8 +134,9 @@ def as_tensor(self, padding_lengths: Dict[str, int]) -> TextFieldTensors: def empty_field(self): text_field = TextField([], self._token_indexers) text_field._indexed_tokens = {} - for indexer_name, indexer in self._token_indexers.items(): - text_field._indexed_tokens[indexer_name] = indexer.get_empty_token_list() + if 
self._token_indexers is not None: + for indexer_name, indexer in self.token_indexers.items(): + text_field._indexed_tokens[indexer_name] = indexer.get_empty_token_list() return text_field @overrides @@ -139,18 +158,20 @@ def batch_tensors(self, tensor_list: List[TextFieldTensors]) -> TextFieldTensors return batched_tensors def __str__(self) -> str: - indexers = { - name: indexer.__class__.__name__ for name, indexer in self._token_indexers.items() - } - # Double tab to indent under the header. formatted_text = "".join( "\t\t" + text + "\n" for text in textwrap.wrap(repr(self.tokens), 100) ) - return ( - f"TextField of length {self.sequence_length()} with " - f"text: \n {formatted_text} \t\tand TokenIndexers : {indexers}" - ) + if self._token_indexers is not None: + indexers = { + name: indexer.__class__.__name__ for name, indexer in self._token_indexers.items() + } + return ( + f"TextField of length {self.sequence_length()} with " + f"text: \n {formatted_text} \t\tand TokenIndexers : {indexers}" + ) + else: + return f"TextField of length {self.sequence_length()} with text: \n {formatted_text}" # Sequence[Token] methods def __iter__(self) -> Iterator[Token]: @@ -172,6 +193,9 @@ def duplicate(self): but it also fails in many cases since some tokenizers (like those used in the 'transformers' lib) cannot actually be deep-copied. """ - new = TextField(deepcopy(self.tokens), {k: v for k, v in self._token_indexers.items()}) + if self._token_indexers is not None: + new = TextField(deepcopy(self.tokens), {k: v for k, v in self._token_indexers.items()}) + else: + new = TextField(deepcopy(self.tokens)) new._indexed_tokens = deepcopy(self._indexed_tokens) return new diff --git a/allennlp/data/image_loader.py b/allennlp/data/image_loader.py new file mode 100644 index 00000000000..f5f081763c6 --- /dev/null +++ b/allennlp/data/image_loader.py @@ -0,0 +1,197 @@ +from os import PathLike +from typing import Union, Sequence, Tuple, List, cast + +from overrides import overrides +import torch +import torchvision +from torch import FloatTensor, IntTensor + +from allennlp.common.file_utils import cached_path +from allennlp.common.registrable import Registrable + +OnePath = Union[str, PathLike] +ManyPaths = Sequence[OnePath] +ImagesWithSize = Tuple[FloatTensor, IntTensor] + + +class ImageLoader(Registrable): + """ + An `ImageLoader` is a callable that takes as input one or more filenames, and outputs two + tensors: one representing the images themselves, and one that just holds the sizes + of each image. + + The first tensor is the images and is of shape `(batch_size, color_channels, height, width)`. + The second tensor is the sizes and is of shape `(batch_size, 2)`, where + the last dimension contains the height and width, respectively. + + If only a single image is passed (as a `Path` or `str`, instead of a list) then + the batch dimension will be removed. + + Subclasses only need to implement the `load()` method, which should load a single image + from a path. + + # Parameters + + size_divisibility : `int`, optional (default = `0`) + If set to a positive number, padding will be added so that the height + and width dimensions are divisible by `size_divisibility`. + Certain models may require this. + + pad_value : `float`, optional (default = `0.0`) + The value to use for padding. + + device : `Union[str, torch.device]`, optional (default = `"cpu"`) + A torch device identifier to put the image and size tensors on. 
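> Editor's note: to make the call signature described above concrete, a hedged usage sketch (the file names are made up; `TorchImageLoader`, defined below in this patch, is the default implementation).

```python
from allennlp.data.image_loader import TorchImageLoader

loader = TorchImageLoader()
images, sizes = loader(["images/cat.jpg", "images/dog.jpg"])
# images: FloatTensor of shape (2, 3, padded_height, padded_width)
# sizes:  IntTensor of shape (2, 2), holding each image's unpadded (height, width)

single_image, single_size = loader("images/cat.jpg")  # batch dimension removed
```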
+ """ + + default_implementation = "torch" + + def __init__( + self, + *, + size_divisibility: int = 0, + pad_value: float = 0.0, + device: Union[str, torch.device] = "cpu", + ) -> None: + self.size_divisibility = size_divisibility + self.pad_value = pad_value + self.device = device + + def __call__(self, filename_or_filenames: Union[OnePath, ManyPaths]) -> ImagesWithSize: + if not isinstance(filename_or_filenames, (list, tuple)): + image, size = self([filename_or_filenames]) # type: ignore[list-item] + return cast(FloatTensor, image.squeeze(0)), cast(IntTensor, size.squeeze(0)) + + images: List[FloatTensor] = [] + sizes: List[IntTensor] = [] + for filename in filename_or_filenames: + image = self.load(cached_path(filename)).to(self.device) + size = cast( + IntTensor, + torch.tensor( + [image.shape[-2], image.shape[-1]], dtype=torch.int32, device=self.device + ), + ) + images.append(image) + sizes.append(size) + return self._pack_image_list(images, sizes) + + def load(self, filename: OnePath) -> FloatTensor: + raise NotImplementedError() + + def _pack_image_list( + self, + images: List[FloatTensor], + sizes: List[IntTensor], + ) -> ImagesWithSize: + """ + A helper method that subclasses can use to turn a list of individual images into a padded + batch. + """ + # Adapted from + # https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/image_list.py. + + # shape: (batch_size, 2) + size_tensor = torch.stack(sizes) # type: ignore[arg-type] + + # shape: (2,) + max_size = size_tensor.max(0).values + + if self.size_divisibility > 1: + # shape: (2,) + max_size = ( + (max_size + self.size_divisibility - 1) // self.size_divisibility + ) * self.size_divisibility + + batched_shape = [len(images)] + list(images[0].shape[:-2]) + list(max_size) + + # shape: (batch_size, color_channels, max_height, max_width) + batched_images = images[0].new_full(batched_shape, self.pad_value) + + for image, batch_slice, size in zip(images, batched_images, size_tensor): + batch_slice[..., : image.shape[-2], : image.shape[-1]].copy_(image) + + return cast(FloatTensor, batched_images), cast(IntTensor, size_tensor) + + +@ImageLoader.register("torch") +class TorchImageLoader(ImageLoader): + """ + This is just a wrapper around the default image loader from [torchvision] + (https://pytorch.org/docs/stable/torchvision/io.html#image). + + # Parameters + + image_backend : `Optional[str]`, optional (default = `None`) + Set the image backend. Can be one of `"PIL"` or `"accimage"`. + resize : `bool`, optional (default = `True`) + If `True` (the default), images will be resized when necessary according + to the values of `min_size` and `max_size`. + normalize: `bool`, optional (default = `True`) + If `True` (the default), images will be normalized according to the values + of `pixel_mean` and `pixel_std`. + min_size : `int`, optional (default = `800`) + If `resize` is `True`, images smaller than this will be resized up to `min_size`. + max_size : `int`, optional (default = `1333`) + If `resize` is `True`, images larger than this will be resized down to `max_size`. + pixel_mean : `Tuple[float, float, float]`, optional (default = `(0.485, 0.456, 0.406)`) + Mean values for image normalization. The defaults are reasonable for most models + from `torchvision`. + pixel_std : `Tuple[float, float, float]`, optional (default = `(0.229, 0.224, 0.225)`) + Standard deviation for image normalization. The defaults are reasonable for most + models from `torchvision`. 
+ size_divisibility : `int`, optional (default = `32`) + Same parameter as with the `ImageLoader` base class, but the default here is + different. + """ + + def __init__( + self, + *, + image_backend: str = None, + resize: bool = True, + normalize: bool = True, + min_size: int = 800, + max_size: int = 1333, + pixel_mean: Tuple[float, float, float] = (0.485, 0.456, 0.406), + pixel_std: Tuple[float, float, float] = (0.229, 0.224, 0.225), + size_divisibility: int = 32, + **kwargs, + ) -> None: + super().__init__(size_divisibility=size_divisibility, **kwargs) + if image_backend is not None: + torchvision.set_image_backend(image_backend) + self.resize = resize + self.normalize = normalize + self.min_size = min_size + self.max_size = max_size + self.pixel_mean = pixel_mean + self.pixel_std = pixel_std + + @overrides + def load(self, filename: OnePath) -> FloatTensor: + image = torchvision.io.read_image(filename).float().to(self.device) / 256 + if self.normalize: + mean = torch.as_tensor(self.pixel_mean, dtype=image.dtype, device=self.device).view( + -1, 1, 1 + ) + std = torch.as_tensor(self.pixel_std, dtype=image.dtype, device=self.device).view( + -1, 1, 1 + ) + image = (image - mean) / std + if self.resize: + # Adapted from https://github.com/pytorch/vision/blob/ + # 4521f6d152875974e317fa247a633e9ad1ea05c8/torchvision/models/detection/transform.py#L36. + min_size = min(image.shape[-2:]) + max_size = max(image.shape[-2:]) + scale_factor = self.min_size / min_size + if max_size * scale_factor > self.max_size: + scale_factor = self.max_size / max_size + image = torch.nn.functional.interpolate( + image[None], + scale_factor=scale_factor, + mode="bilinear", + recompute_scale_factor=True, + align_corners=False, + )[0] + return image diff --git a/allennlp/data/instance.py b/allennlp/data/instance.py index 2247814d1dc..0e1c2da6b9d 100644 --- a/allennlp/data/instance.py +++ b/allennlp/data/instance.py @@ -70,9 +70,9 @@ def index_fields(self, vocab: Vocabulary) -> None: indexed your instances, you might get unexpected behavior. 
""" if not self.indexed: - self.indexed = True for field in self.fields.values(): field.index(vocab) + self.indexed = True def get_padding_lengths(self) -> Dict[str, Dict[str, int]]: """ diff --git a/allennlp/data/samplers/__init__.py b/allennlp/data/samplers/__init__.py index 5cd46487d1d..8e13f8eab07 100644 --- a/allennlp/data/samplers/__init__.py +++ b/allennlp/data/samplers/__init__.py @@ -1,11 +1,3 @@ -from allennlp.data.samplers.samplers import ( - Sampler, - BatchSampler, - SequentialSampler, - SubsetRandomSampler, - WeightedRandomSampler, - RandomSampler, - BasicBatchSampler, -) +from allennlp.data.samplers.batch_sampler import BatchSampler from allennlp.data.samplers.bucket_batch_sampler import BucketBatchSampler from allennlp.data.samplers.max_tokens_batch_sampler import MaxTokensBatchSampler diff --git a/allennlp/data/samplers/batch_sampler.py b/allennlp/data/samplers/batch_sampler.py new file mode 100644 index 00000000000..96907f4ac5d --- /dev/null +++ b/allennlp/data/samplers/batch_sampler.py @@ -0,0 +1,19 @@ +from typing import List, Iterable, Sequence, Optional + +from allennlp.common.registrable import Registrable +from allennlp.data.instance import Instance + + +class BatchSampler(Registrable): + def get_batch_indices(self, instances: Sequence[Instance]) -> Iterable[List[int]]: + raise NotImplementedError + + def get_num_batches(self, instances: Sequence[Instance]) -> int: + raise NotImplementedError + + def get_batch_size(self) -> Optional[int]: + """ + Not all `BatchSamplers` define a consistent `batch_size`, but those that + do should override this method. + """ + return None diff --git a/allennlp/data/samplers/bucket_batch_sampler.py b/allennlp/data/samplers/bucket_batch_sampler.py index 2ff3a326e4e..d65a676f14c 100644 --- a/allennlp/data/samplers/bucket_batch_sampler.py +++ b/allennlp/data/samplers/bucket_batch_sampler.py @@ -1,14 +1,13 @@ import logging -from typing import List, Iterable, Tuple, Optional -import random import math - -from torch.utils import data +from typing import List, Iterable, Tuple, Sequence, Optional +import random from allennlp.common.checks import ConfigurationError from allennlp.common.util import lazy_groups_of from allennlp.data.instance import Instance -from allennlp.data.samplers import BatchSampler +from allennlp.data.samplers.batch_sampler import BatchSampler + logger = logging.getLogger(__name__) @@ -30,13 +29,8 @@ class BucketBatchSampler(BatchSampler): # Parameters - data_source: `data.Dataset`, required - The pytorch `Dataset` of allennlp Instances to bucket. - - In a typical AllenNLP configuration file, this parameter does not get an entry under the - "batch_sampler", it gets constructed separately. batch_size : `int`, required - The size of each batch of instances yielded when calling the dataloader. + The size of each batch of instances yielded when calling the data_loader. sorting_keys : `List[str]`, optional To bucket inputs into batches, we want to group the instances by padding length, so that we @@ -48,7 +42,7 @@ class BucketBatchSampler(BatchSampler): padding keys and seeing which one has the longest length. We use that one for padding. This should give reasonable results in most cases. Some cases where it might not be the right thing to do are when you have a `ListField[TextField]`, or when you have a really - long, constant length `ArrayField`. + long, constant length `TensorField`. 
When you need to specify this yourself, you can create an instance from your dataset and call `Instance.get_padding_lengths()` to see a list of all keys used in your data. You @@ -67,18 +61,14 @@ class BucketBatchSampler(BatchSampler): def __init__( self, - data_source: data.Dataset, batch_size: int, sorting_keys: List[str] = None, padding_noise: float = 0.1, drop_last: bool = False, ): - - self.vocab = data_source.vocab self.sorting_keys = sorting_keys self.padding_noise = padding_noise self.batch_size = batch_size - self.data_source = data_source self.drop_last = drop_last def _argsort_by_padding( @@ -115,8 +105,8 @@ def _argsort_by_padding( [instance_with_index[0][1] for instance_with_index in with_indices], ) - def __iter__(self) -> Iterable[List[int]]: - indices, _ = self._argsort_by_padding(self.data_source) + def get_batch_indices(self, instances: Sequence[Instance]) -> Iterable[List[int]]: + indices, _ = self._argsort_by_padding(instances) batches = [] for group in lazy_groups_of(indices, self.batch_size): batch_indices = list(group) @@ -144,7 +134,6 @@ def _guess_sorting_keys(self, instances: Iterable[Instance], num_instances: int max_length = 0.0 longest_field: Optional[str] = None for i, instance in enumerate(instances): - instance.index_fields(self.vocab) for field_name, field in instance.fields.items(): length = len(field) if length > max_length: @@ -163,9 +152,12 @@ def _guess_sorting_keys(self, instances: Iterable[Instance], num_instances: int ) self.sorting_keys = [longest_field] - def __len__(self): - batch_count_float = len(self.data_source) / self.batch_size + def get_num_batches(self, instances: Sequence[Instance]) -> int: + batch_count_float = len(instances) / self.batch_size if self.drop_last: return math.floor(batch_count_float) else: return math.ceil(batch_count_float) + + def get_batch_size(self) -> Optional[int]: + return self.batch_size diff --git a/allennlp/data/samplers/max_tokens_batch_sampler.py b/allennlp/data/samplers/max_tokens_batch_sampler.py index 917284a1433..8b70586bff4 100644 --- a/allennlp/data/samplers/max_tokens_batch_sampler.py +++ b/allennlp/data/samplers/max_tokens_batch_sampler.py @@ -1,9 +1,11 @@ import logging import random -from typing import List, Iterable, Iterator, TypeVar +from typing import List, Iterable, Iterator, TypeVar, Sequence + +from allennlp.data.instance import Instance +from allennlp.data.samplers.batch_sampler import BatchSampler +from allennlp.data.samplers.bucket_batch_sampler import BucketBatchSampler -from allennlp.data.samplers import BatchSampler, BucketBatchSampler -from torch.utils import data logger = logging.getLogger(__name__) @@ -23,9 +25,6 @@ class MaxTokensBatchSampler(BucketBatchSampler): # Parameters - data_source: `data.Dataset` - The pytorch `Dataset` of allennlp Instances to bucket. - max_tokens : `int` The maximum number of tokens to include in a batch. @@ -39,7 +38,7 @@ class MaxTokensBatchSampler(BucketBatchSampler): padding keys and seeing which one has the longest length. We use that one for padding. This should give reasonable results in most cases. Some cases where it might not be the right thing to do are when you have a `ListField[TextField]`, or when you have a really - long, constant length `ArrayField`. + long, constant length `TensorField`. When you need to specify this yourself, you can create an instance from your dataset and call `Instance.get_padding_lengths()` to see a list of all keys used in your data. 
You @@ -53,13 +52,11 @@ class MaxTokensBatchSampler(BucketBatchSampler): def __init__( self, - data_source: data.Dataset, max_tokens: int, sorting_keys: List[str] = None, padding_noise: float = 0.1, ): - super().__init__(data_source, -1, sorting_keys, padding_noise, False) - + super().__init__(-1, sorting_keys, padding_noise, False) self.max_tokens = max_tokens def _lazy_groups_of_max_size( @@ -98,8 +95,8 @@ def _lazy_groups_of_max_size( if len(group) != 0: yield group - def __iter__(self) -> Iterable[List[int]]: - indices, lengths = self._argsort_by_padding(self.data_source) + def get_batch_indices(self, instances: Sequence[Instance]) -> Iterable[List[int]]: + indices, lengths = self._argsort_by_padding(instances) max_lengths = [max(length) for length in lengths] group_iterator = self._lazy_groups_of_max_size(indices, max_lengths) @@ -109,6 +106,6 @@ def __iter__(self) -> Iterable[List[int]]: for batch in batches: yield batch - def __len__(self): + def get_num_batches(self, instances: Sequence[Instance]) -> int: # There is no easy way to count the number of batches, so we need to iterate and count. - return sum(1 for _ in self) + return sum(1 for _ in self.get_batch_indices(instances)) diff --git a/allennlp/data/samplers/samplers.py b/allennlp/data/samplers/samplers.py deleted file mode 100644 index b045a29c12c..00000000000 --- a/allennlp/data/samplers/samplers.py +++ /dev/null @@ -1,162 +0,0 @@ -from typing import List, Iterable -from torch.utils import data - -from allennlp.common.registrable import Registrable - -""" -Duplicates of the pytorch Sampler classes. Broadly, these only exist -so that we can add type hints, meaning we can construct them from configuration -files. You can use these directly from Python code, but they are identical to the -pytorch ones. -""" - - -class Sampler(Registrable): - """ - A copy of the pytorch [Sampler](https://pytorch.org/docs/stable/_modules/torch/utils/data/sampler.html) - which allows us to register it with `Registrable.` - """ - - def __iter__(self) -> Iterable[int]: - - raise NotImplementedError - - -class BatchSampler(Registrable): - """ - A copy of the pytorch - [BatchSampler](https://pytorch.org/docs/stable/data.html#torch.utils.data.BatchSampler) - which allows us to register it with `Registrable.` - """ - - def __iter__(self) -> Iterable[List[int]]: - - raise NotImplementedError - - -@Sampler.register("sequential") -class SequentialSampler(data.SequentialSampler, Sampler): - """ - A registrable version of pytorch's - [SequentialSampler](https://pytorch.org/docs/stable/data.html#torch.utils.data.SequentialSampler). - - Registered as a `Sampler` with name "sequential". - - In a typical AllenNLP configuration file, `data_source` parameter does not get an entry under - the "sampler", it gets constructed separately. - """ - - def __init__(self, data_source: data.Dataset): - super().__init__(data_source) - - -@Sampler.register("random") -class RandomSampler(data.RandomSampler, Sampler): - """ - A registrable version of pytorch's - [RandomSampler](https://pytorch.org/docs/stable/data.html#torch.utils.data.RandomSampler). - Samples elements randomly. If without replacement, then sample from a shuffled dataset. - If with replacement, then user can specify `num_samples` to draw. - - Registered as a `Sampler` with name "random". - - # Parameters - data_source: `Dataset`, required - The dataset to sample from. - - In a typical AllenNLP configuration file, this parameter does not get an entry under the - "sampler", it gets constructed separately. 
- replacement : `bool`, optional (default = `False`) - Samples are drawn with replacement if `True`. - num_samples: `int` (default = `len(dataset)`) - The number of samples to draw. This argument - is supposed to be specified only when `replacement` is ``True``. - """ - - def __init__( - self, data_source: data.Dataset, replacement: bool = False, num_samples: int = None - ): - super().__init__(data_source, replacement, num_samples) - - -@Sampler.register("subset_random") -class SubsetRandomSampler(data.SubsetRandomSampler, Sampler): - """ - A registrable version of pytorch's - [SubsetRandomSampler](https://pytorch.org/docs/stable/data.html#torch.utils.data.SubsetRandomSampler). - Samples elements randomly from a given list of indices, without replacement. - - Registered as a `Sampler` with name "subset_random". - - # Parameters - indices: `List[int]` - a sequence of indices to sample from. - """ - - def __init__(self, indices: List[int]): - super().__init__(indices) - - -@Sampler.register("weighted_random") -class WeightedRandomSampler(data.WeightedRandomSampler, Sampler): - """ - A registrable version of pytorch's - [WeightedRandomSampler](https://pytorch.org/docs/stable/data.html#torch.utils.data.WeightedRandomSampler). - Samples elements from `[0,...,len(weights)-1]` with given probabilities (weights). - - Registered as a `Sampler` with name "weighted_random". - - # Parameters: - weights : `List[float]` - A sequence of weights, not necessary summing up to one. - num_samples : `int` - The number of samples to draw. - replacement : `bool` - If ``True``, samples are drawn with replacement. - If not, they are drawn without replacement, which means that when a - sample index is drawn for a row, it cannot be drawn again for that row. - - # Examples - - ```python - >>> list(WeightedRandomSampler([0.1, 0.9, 0.4, 0.7, 3.0, 0.6], 5, replacement=True)) - [0, 0, 0, 1, 0] - >>> list(WeightedRandomSampler([0.9, 0.4, 0.05, 0.2, 0.3, 0.1], 5, replacement=False)) - [0, 1, 4, 3, 2] - ``` - """ - - def __init__(self, weights: List[float], num_samples: int, replacement: bool = True): - super().__init__(weights, num_samples, replacement) - - -@BatchSampler.register("basic") -class BasicBatchSampler(data.BatchSampler, BatchSampler): - """ - A registrable version of pytorch's - [BatchSampler](https://pytorch.org/docs/stable/data.html#torch.utils.data.BatchSampler). - Wraps another sampler to yield a mini-batch of indices. - - Registered as a `BatchSampler` with name "basic". - - # Parameters - sampler: `Sampler` - The base sampler. - batch_size : `int` - The size of the batch. - drop_last : `bool` - If `True`, the sampler will drop the last batch if - its size would be less than batch_size`. 
- - # Examples - - ```python - >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False)) - [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] - >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True)) - [[0, 1, 2], [3, 4, 5], [6, 7, 8]] - ``` - """ - - def __init__(self, sampler: Sampler, batch_size: int, drop_last: bool): - super().__init__(sampler, batch_size, drop_last) diff --git a/allennlp/data/token_indexers/elmo_indexer.py b/allennlp/data/token_indexers/elmo_indexer.py index 97dca0bf92a..c5e6c37d910 100644 --- a/allennlp/data/token_indexers/elmo_indexer.py +++ b/allennlp/data/token_indexers/elmo_indexer.py @@ -135,7 +135,7 @@ def tokens_to_indices( ) -> Dict[str, List[List[int]]]: # TODO(brendanr): Retain the token to index mappings in the vocabulary and remove this - # https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/wordpiece_indexer.py#L113 + # https://github.com/allenai/allennlp/blob/main/allennlp/data/token_indexers/wordpiece_indexer.py#L113 return { "elmo_tokens": [self._mapper.convert_word_to_char_ids(t.ensure_text()) for t in tokens] diff --git a/allennlp/interpret/attackers/hotflip.py b/allennlp/interpret/attackers/hotflip.py index 51b39b81390..a9d15db7615 100644 --- a/allennlp/interpret/attackers/hotflip.py +++ b/allennlp/interpret/attackers/hotflip.py @@ -194,6 +194,7 @@ def attack_from_json( whatever it was to `"she"`. """ instance = self.predictor._json_to_instance(inputs) + self.predictor._dataset_reader.apply_token_indexers(instance) if target is None: output_dict = self.predictor._model.forward_on_instance(instance) else: diff --git a/allennlp/models/__init__.py b/allennlp/models/__init__.py index 8b84a4f9f86..af51339ba31 100644 --- a/allennlp/models/__init__.py +++ b/allennlp/models/__init__.py @@ -5,5 +5,6 @@ from allennlp.models.model import Model from allennlp.models.archival import archive_model, load_archive, Archive -from allennlp.models.simple_tagger import SimpleTagger from allennlp.models.basic_classifier import BasicClassifier +from allennlp.models.multitask import MultiTaskModel +from allennlp.models.simple_tagger import SimpleTagger diff --git a/allennlp/models/heads/__init__.py b/allennlp/models/heads/__init__.py new file mode 100644 index 00000000000..0108faf262f --- /dev/null +++ b/allennlp/models/heads/__init__.py @@ -0,0 +1,2 @@ +from allennlp.models.heads.head import Head +from allennlp.models.heads.classifier_head import ClassifierHead diff --git a/allennlp/models/heads/classifier_head.py b/allennlp/models/heads/classifier_head.py new file mode 100644 index 00000000000..fc9d281f297 --- /dev/null +++ b/allennlp/models/heads/classifier_head.py @@ -0,0 +1,133 @@ +from typing import Dict, Optional + +from overrides import overrides +import torch + +from allennlp.data import Vocabulary +from allennlp.models.heads.head import Head +from allennlp.modules import FeedForward, Seq2VecEncoder +from allennlp.training.metrics import CategoricalAccuracy + + +@Head.register("classifier") +class ClassifierHead(Head): + """ + A classification `Head`. Takes encoded text, gets a single vector out of it, runs an optional + feedforward layer on that vector, then classifies it into some label space. + + Registered as a `Head` with name "classifier". + + # Parameters + + vocab : `Vocabulary` + Used to get the number of labels, if `num_labels` is not provided, and to translate label + indices to strings in `make_output_human_readable`. 
+ seq2vec_encoder : `Seq2VecEncoder` + The input to this module is assumed to be a sequence of encoded vectors. We use a + `Seq2VecEncoder` to compress this into a single vector on which we can perform + classification. + feedforward : `FeedForward`, optional, (default = `None`) + An optional feedforward layer to apply on the pooled output before performing the + classification. + input_dim : `int`, optional (default = `None`) + We need to know how many dimensions to use for the final classification weight matrix. If + you have provided either a `seq2vec_encoder` or a `feedforward` module, we can get the + correct size from those objects. If you use default values for both of those parameters, + then you must provide this parameter, so that we know the size of that encoding. + dropout : `float`, optional (default = `None`) + Dropout percentage to use. + num_labels : `int`, optional (default = `None`) + Number of labels to project to in classification layer. By default, the classification layer will + project to the size of the vocabulary namespace corresponding to labels. + label_namespace : `str`, optional (default = `"labels"`) + Vocabulary namespace corresponding to labels. By default, we use the "labels" namespace. + """ + + def __init__( + self, + vocab: Vocabulary, + seq2vec_encoder: Seq2VecEncoder, + feedforward: Optional[FeedForward] = None, + input_dim: int = None, + dropout: float = None, + num_labels: int = None, + label_namespace: str = "labels", + ) -> None: + + super().__init__(vocab) + self._seq2vec_encoder = seq2vec_encoder + self._feedforward = feedforward + if self._feedforward is not None: + self._classifier_input_dim = self._feedforward.get_output_dim() + else: + self._classifier_input_dim = self._seq2vec_encoder.get_output_dim() or input_dim + + if self._classifier_input_dim is None: + raise ValueError("No input dimension given!") + + if dropout: + self._dropout = torch.nn.Dropout(dropout) + else: + self._dropout = None + self._label_namespace = label_namespace + + if num_labels: + self._num_labels = num_labels + else: + self._num_labels = vocab.get_vocab_size(namespace=self._label_namespace) + self._classification_layer = torch.nn.Linear(self._classifier_input_dim, self._num_labels) + self._accuracy = CategoricalAccuracy() + self._loss = torch.nn.CrossEntropyLoss() + + def forward( # type: ignore + self, + encoded_text: torch.FloatTensor, + encoded_text_mask: torch.BoolTensor, + label: torch.IntTensor = None, + ) -> Dict[str, torch.Tensor]: + encoding = self._seq2vec_encoder(encoded_text, mask=encoded_text_mask) + + if self._dropout: + encoding = self._dropout(encoding) + + if self._feedforward is not None: + encoding = self._feedforward(encoding) + + logits = self._classification_layer(encoding) + probs = torch.nn.functional.softmax(logits, dim=-1) + + output_dict = {"logits": logits, "probs": probs} + if label is not None: + loss = self._loss(logits, label.long().view(-1)) + output_dict["loss"] = loss + self._accuracy(logits, label) + + return output_dict + + @overrides + def make_output_human_readable( + self, output_dict: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + """ + Does a simple argmax over the probabilities, converts index to string label, and + add `"label"` key to the dictionary with the result. 
+ """ + if "probs" in output_dict: + predictions = output_dict["probs"] + if predictions.dim() == 2: + predictions_list = [predictions[i] for i in range(predictions.shape[0])] + else: + predictions_list = [predictions] + classes = [] + for prediction in predictions_list: + label_idx = prediction.argmax(dim=-1).item() + label_str = self.vocab.get_index_to_token_vocabulary(self._label_namespace).get( + label_idx, str(label_idx) + ) + classes.append(label_str) + output_dict["label"] = classes + return output_dict + + def get_metrics(self, reset: bool = False) -> Dict[str, float]: + metrics = {"accuracy": self._accuracy.get_metric(reset)} + return metrics diff --git a/allennlp/models/heads/head.py b/allennlp/models/heads/head.py new file mode 100644 index 00000000000..6fe45342aeb --- /dev/null +++ b/allennlp/models/heads/head.py @@ -0,0 +1,21 @@ +from allennlp.models.model import Model + + +class Head(Model): + """ + A `Head` is a `Model` that takes _already encoded input_ and typically does simple computation + before returning a loss. + + There isn't currently any difference in API between a `Model` and a `Head`, but we have this + separate type as both a signaling mechanism for what to expect when looking at a `Head` class, + and so that we can use this as a more informative type annotation when building models that use + `Heads` as inputs. + + One additional consideration in a `Head` is that `make_output_human_readable` needs to account + for the case where it gets called without first having `forward` be called on the head. This is + because at the point where we call `make_output_human_readable`, we don't know which heads were + used in `forward`, and trying to save the state is messy. So just make sure that you always + have conditional logic in `make_output_human_readable` when you implement a `Head`. + """ + + pass diff --git a/allennlp/models/multitask.py b/allennlp/models/multitask.py new file mode 100644 index 00000000000..90cc76457c5 --- /dev/null +++ b/allennlp/models/multitask.py @@ -0,0 +1,197 @@ +from collections import defaultdict +import inspect +from typing import Any, Dict, List, Set, Union, Mapping + +from overrides import overrides +import torch + +from allennlp.data import Vocabulary +from allennlp.modules import Backbone +from allennlp.models.model import Model +from allennlp.models.heads import Head +from allennlp.nn import InitializerApplicator + + +def get_forward_arguments(module: torch.nn.Module) -> Set[str]: + signature = inspect.signature(module.forward) + return set([arg for arg in signature.parameters if arg != "self"]) + + +@Model.register("multitask") +class MultiTaskModel(Model): + """ + A `MultiTaskModel` consists of a `Backbone` that encodes its inputs in some way, then a + collection of `Heads` that make predictions from the backbone-encoded inputs. The predictions + of each `Head` are combined to compute a joint loss, which is then used for training. + + This model works by taking `**kwargs` in `forward`, and passing the right arguments from that to + the backbone and to each head. By default, we use `inspect` to try to figure out getting the + right arguments to the right modules, but we allow you to specify these arguments yourself in + case our inference code gets it wrong. + + It is the caller's responsibility to make sure that the backbone and all heads are compatible with + each other, and with the input data that comes from a `MultiTaskDatasetReader`. 
We give some + arguments in this class and in `MultiTaskDatasetReader` to help with plumbing the arguments in + complex cases (e.g., you can change argument names so that they match what the backbone and + heads expect). + + # Parameters + + vocab: `Vocab` + backbone: `Backbone` + heads: `Dict[str, Head]` + loss_weights: `Dict[str, float]`, optional (default = `equal weighting`) + If you want, you can specify a weight for each head, which we will multiply the loss by when + aggregating across heads. This is equivalent in many cases to specifying a separate + learning rate per head, and just putting a weighting on the loss is much easier than + figuring out the right way to specify that in the optimizer. + arg_name_mapping: `Dict[str, Dict[str, str]]`, optional (default = `identity mapping`) + The mapping changes the names in the `**kwargs` dictionary passed to `forward` before + passing on the arguments to the backbone and heads. This is keyed by component, and the + top-level keys must match the keys passed in the `heads` parameter, plus a "backbone" key + for the backbone. If you are using dataset readers that use dataset-specific names for + their keys, this lets you change them to be consistent. For example, this dictionary might + end up looking like this: `{"backbone": {"question": "text", "review": "text"}, + "classifier1": {"sentiment": "label"}, "classifier2": {"topic": "label"}}`. + Though in this particular example, we have two different inputs mapping to the same key in + the backbone; this will work, as long as you are careful that you don't give both of those + inputs in the same batch. If we see overlapping keys, we will crash. If you want to be able + to do this kind of mixed training in the same batch, you need to handle that in your data + code, not here; we won't handle complex batching inside this model. + allowed_arguments: `Dict[str, Set[str]]`, optional (default = `inferred`) + The list of arguments that should be passed from `**kwargs` to the `forward` method for the + backbone and each head. If you provide this, the keys in here should match the keys given + in the `heads` parameter, plus a "backbone" key for the backbone arguments. If not given, + we will use the `inspect` module to figure this out. The only time that this inference + might fail is if you have optional arguments that you want to be ignored, or + something. You very likely don't need to worry about this argument. + initializer: `InitializerApplicator`, optional (default=`InitializerApplicator()`) + If provided, will be used to initialize the model parameters.
+ """ + + def __init__( + self, + vocab: Vocabulary, + backbone: Backbone, + heads: Dict[str, Head], + *, + loss_weights: Dict[str, float] = None, + arg_name_mapping: Dict[str, Dict[str, str]] = None, + allowed_arguments: Dict[str, Set[str]] = None, + initializer: InitializerApplicator = InitializerApplicator(), + **kwargs, + ): + super().__init__(vocab, **kwargs) + self._backbone = backbone + self._heads = torch.nn.ModuleDict(heads) + self._heads_called: Set[str] = set() + self._arg_name_mapping = arg_name_mapping or defaultdict(dict) + + self._allowed_arguments = allowed_arguments or { + "backbone": get_forward_arguments(backbone), + **{key: get_forward_arguments(heads[key]) for key in heads}, + } + self._loss_weights = loss_weights or defaultdict(lambda: 1.0) + initializer(self) + + def forward(self, **kwargs) -> Dict[str, torch.Tensor]: # type: ignore + if "task" not in kwargs: + raise ValueError( + "Instances for multitask training need to contain a MetadataField with " + "the name 'task' to indicate which task they belong to. Usually the " + "MultitaskDataLoader provides this field and you don't have to do anything." + ) + + task_indices_just_for_mypy: Mapping[str, List[int]] = defaultdict(lambda: []) + for i, task in enumerate(kwargs["task"]): + task_indices_just_for_mypy[task].append(i) + task_indices: Dict[str, torch.LongTensor] = { + task: torch.LongTensor(indices) for task, indices in task_indices_just_for_mypy.items() + } + + def make_inputs_for_task(task: str, whole_batch_input: Union[torch.Tensor, List]): + if isinstance(whole_batch_input, torch.Tensor): + task_indices[task] = task_indices[task].to(whole_batch_input.device) + return torch.index_select(whole_batch_input, 0, task_indices[task]) + else: + return [whole_batch_input[i] for i in task_indices[task]] + + backbone_arguments = self._get_arguments(kwargs, "backbone") + backbone_outputs = self._backbone(**backbone_arguments) + combined_arguments = {**backbone_outputs, **kwargs} + + outputs = {**backbone_outputs} + loss = None + for head_name in self._heads: + if head_name not in task_indices: + continue + + head_arguments = self._get_arguments(combined_arguments, head_name) + head_arguments = { + key: make_inputs_for_task(head_name, value) for key, value in head_arguments.items() + } + + head_outputs = self._heads[head_name](**head_arguments) + for key in head_outputs: + outputs[f"{head_name}_{key}"] = head_outputs[key] + + if "loss" in head_outputs: + self._heads_called.add(head_name) + head_loss = self._loss_weights[head_name] * head_outputs["loss"] + if loss is None: + loss = head_loss + else: + loss += head_loss + + if loss is not None: + outputs["loss"] = loss + + return outputs + + def _get_arguments(self, available_args: Dict[str, Any], component: str) -> Dict[str, Any]: + """ + Given a list of things we might want to pass to a component (where "component" is either the + backbone or a head), this method figures out which things we should actually pass, by + mapping names and looking at allowed arguments. + """ + allowed_args = self._allowed_arguments[component] + name_mapping = self._arg_name_mapping.get(component, {}) + kept_arguments = {} + for key, value in available_args.items(): + new_key = name_mapping.get(key, key) + if new_key in allowed_args: + if new_key in kept_arguments: + raise ValueError( + f"Got duplicate argument {new_key} for {component}. This likely means that" + " you mapped multiple inputs to the same name. 
This is generally ok for" + " the backbone, but you have to be sure each batch only gets one of those" + " inputs. This is typically not ok for heads, and means something is not" + " set up right." + ) + kept_arguments[new_key] = value + return kept_arguments + + @overrides + def get_metrics(self, reset: bool = False) -> Dict[str, float]: + metrics = {} + for head_name in self._heads_called: + for key, value in self._heads[head_name].get_metrics(reset).items(): + metrics[f"{head_name}_{key}"] = value + if reset: + self._heads_called.clear() + return metrics + + @overrides + def make_output_human_readable( + self, output_dict: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + output_dict = self._backbone.make_output_human_readable(output_dict) + for head_name, head in self._heads.items(): + head_outputs = {} + for key, value in output_dict.items(): + if key.startswith(head_name): + head_outputs[key.replace(f"{head_name}_", "")] = value + readable_head_outputs = head.make_output_human_readable(head_outputs) + for key, value in readable_head_outputs.items(): + output_dict[f"{head_name}_{key}"] = value + return output_dict diff --git a/allennlp/modules/__init__.py b/allennlp/modules/__init__.py index 2292ceabd73..0e47f36d0f6 100644 --- a/allennlp/modules/__init__.py +++ b/allennlp/modules/__init__.py @@ -5,6 +5,7 @@ """ from allennlp.modules.attention import Attention +from allennlp.modules.backbones import Backbone from allennlp.modules.bimpm_matching import BiMpmMatching from allennlp.modules.conditional_random_field import ConditionalRandomField from allennlp.modules.elmo import Elmo diff --git a/allennlp/modules/attention/__init__.py b/allennlp/modules/attention/__init__.py index cc806b2252b..ba9ba3ad021 100644 --- a/allennlp/modules/attention/__init__.py +++ b/allennlp/modules/attention/__init__.py @@ -4,3 +4,4 @@ from allennlp.modules.attention.cosine_attention import CosineAttention from allennlp.modules.attention.dot_product_attention import DotProductAttention from allennlp.modules.attention.linear_attention import LinearAttention +from allennlp.modules.attention.scaled_dot_product_attention import ScaledDotProductAttention diff --git a/allennlp/modules/attention/additive_attention.py b/allennlp/modules/attention/additive_attention.py index ca1497733ad..a10947aa3e4 100644 --- a/allennlp/modules/attention/additive_attention.py +++ b/allennlp/modules/attention/additive_attention.py @@ -13,7 +13,8 @@ class AdditiveAttention(Attention): `x` and the matrix `y` is computed as `V tanh(Wx + Uy)`. This attention is often referred as concat or additive attention. It was introduced in - by Bahdanau et al. + [Neural Machine Translation by Jointly Learning to Align and Translate (Bahdanau et al, 2015)] + (https://api.semanticscholar.org/CorpusID:11212020). Registered as an `Attention` with name "additive". diff --git a/allennlp/modules/attention/dot_product_attention.py b/allennlp/modules/attention/dot_product_attention.py index 822fc50c277..3a4466296eb 100644 --- a/allennlp/modules/attention/dot_product_attention.py +++ b/allennlp/modules/attention/dot_product_attention.py @@ -8,6 +8,9 @@ class DotProductAttention(Attention): """ Computes attention between a vector and a matrix using dot product. + Reference: [Attention Is All You Need (Vaswani et al, 2017)] + (https://api.semanticscholar.org/CorpusID:13756489) + Registered as an `Attention` with name "dot_product". 
""" diff --git a/allennlp/modules/attention/scaled_dot_product_attention.py b/allennlp/modules/attention/scaled_dot_product_attention.py new file mode 100644 index 00000000000..36ecf592887 --- /dev/null +++ b/allennlp/modules/attention/scaled_dot_product_attention.py @@ -0,0 +1,33 @@ +import math +import torch +from overrides import overrides +from allennlp.modules.attention.attention import Attention + + +@Attention.register("scaled_dot_product") +class ScaledDotProductAttention(Attention): + """ + Computes attention between two tensors using scaled dot product. + # Reference: [Attention Is All You Need (Vaswani et al, 2017)] + # (https://api.semanticscholar.org/CorpusID:13756489) + + Registered as an `Attention` with name "scaled_dot_product". + + # Parameters + + scaling_factor : `int`, required + The similarity score is scaled down by the `scaling_factor`. + normalize : `bool`, optional (default=`True`) + If true, we normalize the computed similarities with a softmax, to return a probability + distribution for your attention. If false, this is just computing a similarity score. + """ + + def __init__(self, scaling_factor: int, normalize: bool = True) -> None: + super().__init__(normalize) + self.scaling_factor = scaling_factor + + @overrides + def _forward_internal(self, vector: torch.Tensor, matrix: torch.Tensor) -> torch.Tensor: + scores = torch.matmul(vector, matrix) + scores = scores / math.sqrt(self.scaling_factor) + return scores diff --git a/allennlp/modules/backbones/__init__.py b/allennlp/modules/backbones/__init__.py new file mode 100644 index 00000000000..050d67fd2e1 --- /dev/null +++ b/allennlp/modules/backbones/__init__.py @@ -0,0 +1,3 @@ +from allennlp.modules.backbones.backbone import Backbone +from allennlp.modules.backbones.pretrained_transformer_backbone import PretrainedTransformerBackbone +from allennlp.modules.backbones.vilbert_backbone import VilbertBackbone diff --git a/allennlp/modules/backbones/backbone.py b/allennlp/modules/backbones/backbone.py new file mode 100644 index 00000000000..e4bb14f605b --- /dev/null +++ b/allennlp/modules/backbones/backbone.py @@ -0,0 +1,41 @@ +from typing import Dict + +import torch + +from allennlp.common import Registrable + + +class Backbone(Registrable, torch.nn.Module): + """ + A `Backbone` operates on basic model inputs and produces some encoding of those inputs that will + be shared among one or more `Heads` in a multi-task setting. For plain text inputs, this is + often a transformer. + + The main purpose of this class is to give us a `Registrable` class that we can use as a type + annotation on `Model` classes that want to use a backbone. The expectation is that this will + take the same inputs as a typical model, but return intermediate representations. These should + generally be returned as a dictionary, from which the caller will have to pull out what they + want and use as desired. As a convention that these modules should generally follow, their + outputs should have the same name as the given input, prepended with `encoded_`. So, a backbone + that encodes a `text` input should return an output called `encoded_text`. This convention + allows easier exchangeability of these backbone modules. + + Additionally, as downstream `Heads` will typically need mask information, but after encoding + have no way of computing it, a `Backbone` should also return a mask for each of its outputs, + with the same name as the output but with `_mask` appended. 
So in our example of `text` as + input, the output should have an entry called `encoded_text_mask`. + + Because a `Backbone` handles model inputs, if you want to make those inputs human readable + (e.g., for displaying them in a demo), then it's typically only the `Backbone` object that knows + how to do that. So we also implement the `make_output_human_readable` function from the `Model` + class. The implementation in the base class does nothing, but concrete classes should generally + convert whatever input indices are saved to the output into text. + """ + + def forward(self, **kwargs) -> Dict[str, torch.Tensor]: + raise NotImplementedError + + def make_output_human_readable( + self, output_dict: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + return output_dict diff --git a/allennlp/modules/backbones/pretrained_transformer_backbone.py b/allennlp/modules/backbones/pretrained_transformer_backbone.py new file mode 100644 index 00000000000..365bca699ec --- /dev/null +++ b/allennlp/modules/backbones/pretrained_transformer_backbone.py @@ -0,0 +1,117 @@ +from typing import Dict, Optional + +from overrides import overrides +import torch + +from allennlp.data.fields.text_field import TextFieldTensors +from allennlp.data.vocabulary import Vocabulary +from allennlp.modules.backbones.backbone import Backbone +from allennlp.modules.token_embedders.pretrained_transformer_embedder import ( + PretrainedTransformerEmbedder, +) +from allennlp.nn import util + + +@Backbone.register("pretrained_transformer") +class PretrainedTransformerBackbone(Backbone): + """ + Uses a pretrained model from `transformers` as a `Backbone`. + + This class passes most of its arguments to a `PretrainedTransformerEmbedder`, which it uses to + implement the underlying encoding logic (we duplicate the arguments here instead of taking an + `Embedder` as a constructor argument just to simplify the user-facing API). + + Registered as a `Backbone` with name "pretrained_transformer". + + # Parameters + + vocab : `Vocabulary` + Necessary for converting input ids to strings in `make_output_human_readable`. If you set + `output_token_strings` to `False`, or if you never call `make_output_human_readable`, then + this will not be used and can be safely set to `None`. + model_name : `str` + The name of the `transformers` model to use. Should be the same as the corresponding + `PretrainedTransformerIndexer`. + max_length : `int`, optional (default = `None`) + If positive, folds input token IDs into multiple segments of this length, pass them + through the transformer model independently, and concatenate the final representations. + Should be set to the same value as the `max_length` option on the + `PretrainedTransformerIndexer`. + sub_module: `str`, optional (default = `None`) + The name of a submodule of the transformer to be used as the embedder. Some transformers naturally act + as embedders such as BERT. However, other models consist of encoder and decoder, in which case we just + want to use the encoder. + train_parameters: `bool`, optional (default = `True`) + If this is `True`, the transformer weights get updated during training. + last_layer_only: `bool`, optional (default = `True`) + When `True` (the default), only the final layer of the pretrained transformer is taken + for the embeddings. But if set to `False`, a scalar mix of all of the layers + is used. 
+ output_token_strings : `bool`, optional (default = `True`) + If `True`, we will add the input token ids to the output dictionary in `forward` (with key + "token_ids"), and convert them to strings in `make_output_human_readable` (with key + "tokens"). This is necessary for certain demo functionality, and it adds only a trivial + amount of computation if you are not using a demo. + vocab_namespace : `str`, optional (default = `"tags"`) + The namespace to use in conjunction with the `Vocabulary` above. We use a somewhat + confusing default of "tags" here, to match what is done in `PretrainedTransformerIndexer`. + """ + + def __init__( + self, + vocab: Vocabulary, + model_name: str, + *, + max_length: int = None, + sub_module: str = None, + train_parameters: bool = True, + last_layer_only: bool = True, + override_weights_file: Optional[str] = None, + override_weights_strip_prefix: Optional[str] = None, + output_token_strings: bool = True, + vocab_namespace: str = "tags", + ) -> None: + super().__init__() + self._vocab = vocab + self._namespace = vocab_namespace + self._embedder = PretrainedTransformerEmbedder( + model_name=model_name, + max_length=max_length, + sub_module=sub_module, + train_parameters=train_parameters, + last_layer_only=last_layer_only, + override_weights_file=override_weights_file, + override_weights_strip_prefix=override_weights_strip_prefix, + ) + self._output_token_strings = output_token_strings + + def forward(self, text: TextFieldTensors) -> Dict[str, torch.Tensor]: # type: ignore + if len(text) != 1: + raise ValueError( + "PretrainedTransformerBackbone is only compatible with using a single TokenIndexer" + ) + text_inputs = next(iter(text.values())) + mask = util.get_text_field_mask(text) + encoded_text = self._embedder(**text_inputs) + outputs = {"encoded_text": encoded_text, "encoded_text_mask": mask} + if self._output_token_strings: + outputs["token_ids"] = util.get_token_ids_from_text_field_tensors(text) + return outputs + + @overrides + def make_output_human_readable( + self, output_dict: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + if not self._output_token_strings: + return output_dict + + tokens = [] + for instance_tokens in output_dict["token_ids"]: + tokens.append( + [ + self._vocab.get_token_from_index(token_id.item(), namespace=self._namespace) + for token_id in instance_tokens + ] + ) + output_dict["tokens"] = tokens + return output_dict diff --git a/allennlp/modules/backbones/vilbert_backbone.py b/allennlp/modules/backbones/vilbert_backbone.py new file mode 100644 index 00000000000..99f790d1896 --- /dev/null +++ b/allennlp/modules/backbones/vilbert_backbone.py @@ -0,0 +1,234 @@ +import logging +from typing import Dict, List + +import torch +from overrides import overrides + +from allennlp.data.fields.text_field import TextFieldTensors +from allennlp.data.vocabulary import Vocabulary +from allennlp.modules.backbones.backbone import Backbone +from allennlp.modules.transformer import BiModalEncoder, ImageFeatureEmbeddings, Embeddings + +logger = logging.getLogger(__name__) + + +@Backbone.register("vilbert") +@Backbone.register("vilbert_from_huggingface", constructor="from_huggingface_model_name") +class VilbertBackbone(Backbone): + """ + Uses a Vilbert model as a `Backbone`. + Registered as a `Backbone` with name "vilbert". 
+ """ + + def __init__( + self, + vocab: Vocabulary, + text_embeddings: Embeddings, + image_embeddings: ImageFeatureEmbeddings, + encoder: BiModalEncoder, + pooled_output_dim: int, + fusion_method: str = "sum", + dropout: float = 0.1, + vocab_namespace: str = "tokens", + ) -> None: + super().__init__() + self.fusion_method = fusion_method + self.text_embeddings = text_embeddings + self.image_embeddings = image_embeddings + self.encoder = encoder + from allennlp.modules.transformer import TransformerPooler + + self.t_pooler = TransformerPooler(encoder.hidden_size1, pooled_output_dim) + self.v_pooler = TransformerPooler(encoder.hidden_size2, pooled_output_dim) + self.dropout = torch.nn.Dropout(dropout) + + self._vocab = vocab + self._namespace = vocab_namespace + + @classmethod + def from_huggingface_model_name( + cls, + vocab: Vocabulary, + model_name: str, + image_feature_dim: int, + image_num_hidden_layers: int, + image_hidden_size: int, + image_num_attention_heads: int, + combined_hidden_size: int, + combined_num_attention_heads: int, + pooled_output_dim: int, + image_intermediate_size: int, + image_attention_dropout: float, + image_hidden_dropout: float, + image_biattention_id: List[int], + text_biattention_id: List[int], + text_fixed_layer: int, + image_fixed_layer: int, + fusion_method: str = "sum", + ): + from transformers import AutoModel + + transformer = AutoModel.from_pretrained(model_name) + + from copy import deepcopy + + text_embeddings = deepcopy(transformer.embeddings) + + # Albert (and maybe others?) has this "embedding_size", that's different from "hidden_size". + # To get them to the same dimensionality, it uses a linear transform after the embedding + # layer, which we need to pull out and copy here. + if hasattr(transformer.config, "embedding_size"): + config = transformer.config + + from transformers.models.albert.modeling_albert import AlbertModel + + if isinstance(transformer, AlbertModel): + linear_transform = deepcopy(transformer.encoder.embedding_hidden_mapping_in) + else: + logger.warning( + "Unknown model that uses separate embedding size; weights of the linear " + f"transform will not be initialized. Model type is: {transformer.__class__}" + ) + linear_transform = torch.nn.Linear(config.embedding_size, config.hidden_size) + + # We can't just use torch.nn.Sequential here, even though that's basically all this is, + # because Sequential doesn't accept *inputs, only a single argument.
+ + class EmbeddingsShim(torch.nn.Module): + def __init__(self, embeddings: torch.nn.Module, linear_transform: torch.nn.Module): + super().__init__() + self.linear_transform = linear_transform + self.embeddings = embeddings + + def forward(self, *inputs, **kwargs): + return self.linear_transform(self.embeddings(*inputs, **kwargs)) + + text_embeddings = EmbeddingsShim(text_embeddings, linear_transform) + + image_embeddings = ImageFeatureEmbeddings( + feature_size=image_feature_dim, + embedding_size=image_hidden_size, + dropout=image_hidden_dropout, + ) + + encoder = BiModalEncoder.from_pretrained_module( + pretrained_module=transformer, + num_hidden_layers2=image_num_hidden_layers, + hidden_size2=image_hidden_size, + num_attention_heads2=image_num_attention_heads, + combined_hidden_size=combined_hidden_size, + combined_num_attention_heads=combined_num_attention_heads, + intermediate_size2=image_intermediate_size, + attention_dropout2=image_attention_dropout, + hidden_dropout2=image_hidden_dropout, + biattention_id1=text_biattention_id, + biattention_id2=image_biattention_id, + fixed_layer1=text_fixed_layer, + fixed_layer2=image_fixed_layer, + ) + return cls( + vocab=vocab, + text_embeddings=text_embeddings, + image_embeddings=image_embeddings, + encoder=encoder, + pooled_output_dim=pooled_output_dim, + fusion_method=fusion_method, + ) + + @overrides + def forward( + self, # type: ignore + box_features: torch.Tensor, + box_coordinates: torch.Tensor, + box_mask: torch.Tensor, + text: TextFieldTensors, + ) -> Dict[str, torch.Tensor]: + batch_size, _, feature_size = box_features.size() + + if "token_ids" in text["tokens"]: + token_ids = text["tokens"]["token_ids"] + else: + token_ids = text["tokens"]["tokens"] + + # Shape: (batch_size, num_tokens) + token_type_ids = text["tokens"].get("type_ids") + # Shape: (batch_size, num_tokens) + attention_mask = text["tokens"].get("mask") + + # Shape: (batch_size, num_tokens, embedding_dim) + embedding_output = self.text_embeddings(token_ids, token_type_ids) + num_tokens = embedding_output.size(1) + + # this attention mask is more simple than the triangular masking of + # causal attention used in OpenAI GPT, we just need to prepare the + # broadcast dimension here. + if attention_mask is not None: + extended_attention_mask = attention_mask + else: + extended_attention_mask = None + + extended_image_attention_mask = box_mask + + # Shape: (batch_size, feature_size, num_tokens) + # TODO (epwalsh): Why all zeros?? This doesn't seem right. 
+ extended_co_attention_mask = torch.zeros( + batch_size, + feature_size, + num_tokens, + dtype=extended_image_attention_mask.dtype, + ) + + # Shape: (batch_size, num_boxes, image_embedding_dim) + v_embedding_output = self.image_embeddings(box_features, box_coordinates) + + encoded_layers_t, encoded_layers_v = self.encoder( + embedding_output, + v_embedding_output, + extended_attention_mask, + extended_image_attention_mask, + extended_co_attention_mask, + ) + + # Shape: (batch_size, num_tokens, embedding_dim) + sequence_output_t = encoded_layers_t[:, :, :, -1] + # Shape: (batch_size, num_boxes, image_embedding_dim) + sequence_output_v = encoded_layers_v[:, :, :, -1] + + # Shape: (batch_size, pooled_output_dim) + pooled_output_t = self.t_pooler(sequence_output_t) + # Shape: (batch_size, pooled_output_dim) + pooled_output_v = self.v_pooler(sequence_output_v) + + if self.fusion_method == "sum": + pooled_output = self.dropout(pooled_output_t + pooled_output_v) + elif self.fusion_method == "mul": + pooled_output = self.dropout(pooled_output_t * pooled_output_v) + else: + raise ValueError(f"Fusion method '{self.fusion_method}' not supported") + + return { + "encoded_boxes": sequence_output_v, + "encoded_boxes_mask": box_mask, + "encoded_boxes_pooled": pooled_output_v, + "encoded_text": sequence_output_t, + "encoded_text_mask": attention_mask, + "encoded_text_pooled": pooled_output_t, + "pooled_boxes_and_text": pooled_output, + } + + @overrides + def make_output_human_readable( + self, output_dict: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + tokens = [] + for instance_tokens in output_dict[ + "token_ids" + ]: # TODO: do we even have "token_ids" in the output? + tokens.append( + [ + self._vocab.get_token_from_index(token_id.item(), namespace=self._namespace) + for token_id in instance_tokens + ] + ) + output_dict["tokens"] = tokens + return output_dict diff --git a/allennlp/modules/elmo.py b/allennlp/modules/elmo.py index 8c45479f374..1061a8fbdc4 100644 --- a/allennlp/modules/elmo.py +++ b/allennlp/modules/elmo.py @@ -12,7 +12,9 @@ from allennlp.common.checks import ConfigurationError from allennlp.common.file_utils import cached_path from allennlp.common.util import lazy_groups_of -from allennlp.data import Instance, Token, Vocabulary +from allennlp.data.instance import Instance +from allennlp.data.tokenizers.token_class import Token +from allennlp.data.vocabulary import Vocabulary from allennlp.data.batch import Batch from allennlp.data.fields import TextField from allennlp.data.token_indexers.elmo_indexer import ( diff --git a/allennlp/modules/text_field_embedders/basic_text_field_embedder.py b/allennlp/modules/text_field_embedders/basic_text_field_embedder.py index b56b70cf0f1..477dd8f1086 100644 --- a/allennlp/modules/text_field_embedders/basic_text_field_embedder.py +++ b/allennlp/modules/text_field_embedders/basic_text_field_embedder.py @@ -5,7 +5,7 @@ from overrides import overrides from allennlp.common.checks import ConfigurationError -from allennlp.data import TextFieldTensors +from allennlp.data.fields.text_field import TextFieldTensors from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder from allennlp.modules.time_distributed import TimeDistributed from allennlp.modules.token_embedders.token_embedder import TokenEmbedder diff --git a/allennlp/modules/text_field_embedders/text_field_embedder.py b/allennlp/modules/text_field_embedders/text_field_embedder.py index fb9db2488c5..0ae8ceac555 100644 --- 
a/allennlp/modules/text_field_embedders/text_field_embedder.py +++ b/allennlp/modules/text_field_embedders/text_field_embedder.py @@ -1,7 +1,7 @@ import torch -from allennlp.common import Registrable -from allennlp.data import TextFieldTensors +from allennlp.common.registrable import Registrable +from allennlp.data.fields.text_field import TextFieldTensors class TextFieldEmbedder(torch.nn.Module, Registrable): diff --git a/allennlp/modules/time_distributed.py b/allennlp/modules/time_distributed.py index 48dcade88fc..b8d8582f37e 100644 --- a/allennlp/modules/time_distributed.py +++ b/allennlp/modules/time_distributed.py @@ -56,8 +56,18 @@ def forward(self, *inputs, pass_through: List[str] = None, **kwargs): # Now get the output back into the right shape. # (batch_size, time_steps, **output_size) - new_size = some_input.size()[:2] + reshaped_outputs.size()[1:] - outputs = reshaped_outputs.contiguous().view(new_size) + tuple_output = True + if not isinstance(reshaped_outputs, tuple): + tuple_output = False + reshaped_outputs = (reshaped_outputs,) + + outputs = [] + for reshaped_output in reshaped_outputs: + new_size = some_input.size()[:2] + reshaped_output.size()[1:] + outputs.append(reshaped_output.contiguous().view(new_size)) + + if not tuple_output: + outputs = outputs[0] return outputs diff --git a/allennlp/modules/token_embedders/bag_of_word_counts_token_embedder.py b/allennlp/modules/token_embedders/bag_of_word_counts_token_embedder.py index ebc18522011..e94d2252a01 100644 --- a/allennlp/modules/token_embedders/bag_of_word_counts_token_embedder.py +++ b/allennlp/modules/token_embedders/bag_of_word_counts_token_embedder.py @@ -1,7 +1,7 @@ import torch from allennlp.common.checks import ConfigurationError -from allennlp.data import Vocabulary +from allennlp.data.vocabulary import Vocabulary from allennlp.modules.token_embedders.token_embedder import TokenEmbedder from allennlp.nn.util import get_text_field_mask diff --git a/allennlp/modules/token_embedders/embedding.py b/allennlp/modules/token_embedders/embedding.py index 03f8e4f40fa..a68bdf8c4a1 100644 --- a/allennlp/modules/token_embedders/embedding.py +++ b/allennlp/modules/token_embedders/embedding.py @@ -15,7 +15,7 @@ from allennlp.common import Tqdm from allennlp.common.checks import ConfigurationError from allennlp.common.file_utils import cached_path, get_file_extension, is_url_or_existing_file -from allennlp.data import Vocabulary +from allennlp.data.vocabulary import Vocabulary from allennlp.modules.time_distributed import TimeDistributed from allennlp.modules.token_embedders.token_embedder import TokenEmbedder from allennlp.nn import util diff --git a/allennlp/modules/transformer/__init__.py b/allennlp/modules/transformer/__init__.py new file mode 100644 index 00000000000..f346ace8360 --- /dev/null +++ b/allennlp/modules/transformer/__init__.py @@ -0,0 +1,142 @@ +""" +The transformer toolkit provides a set of reusable modules that can be used to experiment +with transformer architectures. It also simplifies the way one can take apart +the pretrained transformer weights from an existing model, and plug them in a new architecture. + +Examples: + +1. Create a small transformer that uses GLoVE embeddings. 
+ +``` +embedding_file = str(self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz") + +class TinyTransformer(TokenEmbedder): + def __init__(self, vocab, embedding_dim, hidden_size, intermediate_size): + super().__init__() + self.embeddings = Embedding( + pretrained_file=embedding_file, + embedding_dim=embedding_dim, + projection_dim=hidden_size, + vocab=vocab, + ) + + self.transformer = TransformerStack( + num_hidden_layers=4, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + ) + + @overrides + def forward(self, token_ids: torch.LongTensor): + x = self.embeddings(token_ids) + x = self.transformer(x) + return x + +tiny = TinyTransformer(self.vocab, embedding_dim=300, hidden_size=80, intermediate_size=40) +``` + +2. Use the first 4 layers of `bert-base-uncased`. + +``` +pretrained = cached_transformers.get("bert-base-uncased", False) + +class SmallTransformer(TokenEmbedder): + def __init__(self): + super().__init__() + self.embeddings = TransformerEmbeddings.from_pretrained_module(pretrained) + + self.transformer = TransformerStack.from_pretrained_module( + pretrained, num_hidden_layers=4 + ) + + @overrides + def forward(self, token_ids: torch.LongTensor): + x = self.embeddings(token_ids) + x = self.transformer(x) + return x + +small = SmallTransformer() +assert len(small.transformer.layers) == 4 +small.forward(torch.LongTensor([[0, 1, 2]])) +``` + +3. Use the first 8 layers of `bert-base-uncased` to separately encode two text inputs, combine the representations, +and use the last 4 layers on the combined representation. + +``` +class MediumTransformer(torch.nn.Module): + def __init__(self): + super().__init__() + self.embeddings = TransformerEmbeddings.from_pretrained_module("bert-base-uncased") + self.separate_transformer = TransformerStack.from_pretrained_module( + "bert-base-uncased", num_hidden_layers=range(0, 8) + ) + self.combined_transformer = TransformerStack.from_pretrained_module( + "bert-base-uncased", + num_hidden_layers=range(8, 12), + ) + + @overrides + def forward( + self, + left_token_ids: torch.LongTensor, + right_token_ids: torch.LongTensor, + ): + + left = self.embeddings(left_token_ids) + left = self.separate_transformer(left) + + right = self.embeddings(right_token_ids) + right = self.separate_transformer(right) + + # combine the sequences in some meaningful way. + # Here, we just add them for simplicity. In reality, + # concatenation may be a better option. + combined = left + right + + return self.combined_transformer(combined) + +medium = MediumTransformer() +assert (len(medium.separate_transformer.layers)) == 8 +assert (len(medium.combined_transformer.layers)) == 4 +``` + +4. Combine different flavors of BERT. + +``` +# Regular BERT, but with AlBERT's special compressed embedding scheme +class AlmostRegularTransformer(TokenEmbedder): + def __init__(self): + super().__init__() + self.embeddings = TransformerEmbeddings.get_relevant_module("albert-base-v2") + self.transformer = TransformerStack.from_pretrained_module("bert-base-uncased") + # We want to tune only the embeddings, because that's our experiment. 
+ self.transformer.requires_grad = False + + @overrides + def forward(self, token_ids: torch.LongTensor, mask: torch.BoolTensor): + x = self.embeddings(token_ids, mask) + x = self.transformer(x) + return x + +almost = AlmostRegularTransformer() +``` +""" + +from allennlp.modules.transformer.positional_encoding import SinusoidalPositionalEncoding + +from allennlp.modules.transformer.transformer_module import TransformerModule +from allennlp.modules.transformer.transformer_embeddings import ( + Embeddings, + TransformerEmbeddings, + ImageFeatureEmbeddings, +) +from allennlp.modules.transformer.self_attention import SelfAttention +from allennlp.modules.transformer.activation_layer import ActivationLayer +from allennlp.modules.transformer.transformer_layer import AttentionLayer, TransformerLayer +from allennlp.modules.transformer.transformer_stack import TransformerStack +from allennlp.modules.transformer.transformer_pooler import TransformerPooler +from allennlp.modules.transformer.output_layer import OutputLayer + +from allennlp.modules.transformer.bimodal_attention import BiModalAttention +from allennlp.modules.transformer.bimodal_encoder import BiModalEncoder diff --git a/allennlp/modules/transformer/activation_layer.py b/allennlp/modules/transformer/activation_layer.py new file mode 100644 index 00000000000..f5ffc4cb58f --- /dev/null +++ b/allennlp/modules/transformer/activation_layer.py @@ -0,0 +1,32 @@ +from typing import Union +import torch + +from allennlp.common import FromParams + +from allennlp.modules.transformer.transformer_module import TransformerModule + +from transformers.models.bert.modeling_bert import ACT2FN + + +class ActivationLayer(TransformerModule, FromParams): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + activation: Union[str, torch.nn.Module], + pool: bool = False, + ): + super().__init__() + self.dense = torch.nn.Linear(hidden_size, intermediate_size) + if isinstance(activation, str): + self.act_fn = ACT2FN[activation] + else: + self.act_fn = activation + self.pool = pool + + def forward(self, hidden_states): + if self.pool: + hidden_states = hidden_states[:, 0] + hidden_states = self.dense(hidden_states) + hidden_states = self.act_fn(hidden_states) + return hidden_states diff --git a/allennlp/modules/transformer/bimodal_attention.py b/allennlp/modules/transformer/bimodal_attention.py new file mode 100644 index 00000000000..fc6bb4047f9 --- /dev/null +++ b/allennlp/modules/transformer/bimodal_attention.py @@ -0,0 +1,205 @@ +import torch + +from allennlp.common import FromParams +from allennlp.modules.attention import Attention +from allennlp.modules.transformer.transformer_module import TransformerModule +from allennlp.modules.transformer.util import apply_mask + + +class BiModalAttention(TransformerModule, FromParams): + """ + Computes attention for two modalities, based on + [ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations + for Vision-and-Language Tasks (Lu et al, 2019)] + (https://api.semanticscholar.org/CorpusID:199453025). + + From the paper: + + "The keys and values from each modality are passed as input to the + other modality’s multi-headed attention block. Consequentially, the + attention block produces attention-pooled features for each modality + conditioned on the other." 
+ + For example, considering the case when the first modality is image, + and the second modality is language, the module performs + "image-conditioned language attention in the visual stream and + language-conditioned image attention in the linguistic stream." + + # Parameters + + hidden_size1 : `int` + The input hidden dim for the first modality. + hidden_size2 : `int` + The input hidden dim for the second modality. + combined_hidden_size : `int` + The output hidden dim for both modalities; it should be a multiple + of `num_attention_heads`. + num_attention_heads : `int` + The number of attention heads. + dropout1 : `float` (default = `0.0`) + The dropout probability for the first modality stream. + dropout2 : `float` (default = `0.0`) + The dropout probability for the second modality stream. + scoring_func1 : `str` (default = `scaled_dot_product`) + The name of the attention-calculating function to be used for the first modality. + scoring_func2 : `str` (default = `scaled_dot_product`) + The name of the attention-calculating function to be used for the second modality. + Eg. `additive`, `linear`, etc. For a complete list, please check :mod:`allennlp.modules.attention`. + """ + + def __init__( + self, + hidden_size1: int, + hidden_size2: int, + combined_hidden_size: int, + num_attention_heads: int, + dropout1: float = 0.0, + dropout2: float = 0.0, + scoring_func1: str = "scaled_dot_product", + scoring_func2: str = "scaled_dot_product", + ): + super().__init__() + if combined_hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (combined_hidden_size, num_attention_heads) + ) + + self.num_attention_heads = num_attention_heads + self.attention_head_size = int(combined_hidden_size / num_attention_heads) + + # This is basically the `combined_hidden_size`, since we already ensure + # that `combined_hidden_size` is divisible by `num_attention_heads`. 
+ self.all_head_size = self.num_attention_heads * self.attention_head_size + + # First modality: + + self.query1 = torch.nn.Linear(hidden_size1, self.all_head_size) + self.key1 = torch.nn.Linear(hidden_size1, self.all_head_size) + self.value1 = torch.nn.Linear(hidden_size1, self.all_head_size) + + self.scoring_func1 = scoring_func1 + if self.scoring_func1 in ["additive", "linear", "bilinear"]: + self.attn1 = Attention.by_name(self.scoring_func1)(hidden_size1, hidden_size1) + elif self.scoring_func1 == "scaled_dot_product": + self.attn1 = Attention.by_name(self.scoring_func1)(self.attention_head_size, False) + else: + self.attn1 = Attention.by_name(self.scoring_func1)() + + self.dropout1 = torch.nn.Dropout(dropout1) + + # Second modality: + + self.query2 = torch.nn.Linear(hidden_size2, self.all_head_size) + self.key2 = torch.nn.Linear(hidden_size2, self.all_head_size) + self.value2 = torch.nn.Linear(hidden_size2, self.all_head_size) + + self.scoring_func2 = scoring_func2 + if self.scoring_func2 in ["additive", "linear", "bilinear"]: + self.attn2 = Attention.by_name(self.scoring_func2)(hidden_size2, hidden_size2) + elif self.scoring_func2 == "scaled_dot_product": + self.attn2 = Attention.by_name(self.scoring_func2)(self.attention_head_size, False) + else: + self.attn2 = Attention.by_name(self.scoring_func2)() + + self.dropout2 = torch.nn.Dropout(dropout2) + + def _transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + input_tensor1, + input_tensor2, + attention_mask1=None, + attention_mask2=None, + co_attention_mask=None, + use_co_attention_mask=False, + ): + """ + input_tensor1 : `torch.Tensor` + Shape `batch_size x seq_len1 x hidden_dim1` + where `seq_len1` can be the sequence length + when the modality is text, or the number of + regions when the modality is image. + input_tensor2 : `torch.Tensor` + Shape `batch_size x seq_len2 x hidden_dim2` + where `seq_len2` can be the sequence length + when the modality is text, or the number of + regions when the modality is image. + attention_mask1 : `torch.BoolTensor`, optional + Shape `batch_size x seq_len1` + attention_mask : `torch.BoolTensor`, optional + Shape `batch_size x seq_len2` + co_attention_mask : `torch.Tensor`, optional + Shape `batch_size x seq_len1 x seq_len2 x all_head_size` + This mask is for cases when you already have some prior information + about the interaction between the two modalities. For example, + if you know which words correspond to which regions in the image, + this mask can be applied to limit the attention given the bias. + use_co_attention_mask : `bool` + # TODO: is this flag necessary? + Whether to use co_attention_mask or not, default = `False`. 
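+
+        Example (an illustrative sketch; the shapes and sizes below are arbitrary):
+
+        ```
+        attention = BiModalAttention(
+            hidden_size1=8, hidden_size2=8, combined_hidden_size=16, num_attention_heads=2
+        )
+        text = torch.randn(2, 5, 8)   # batch_size x seq_len1 x hidden_dim1
+        image = torch.randn(2, 3, 8)  # batch_size x seq_len2 x hidden_dim2
+        # Two streams of attention-pooled features, one conditioned on each modality.
+        context1, context2 = attention(text, image)
+        ```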
+ """ + + # for the first modality: + mixed_query_layer1 = self.query1(input_tensor1) + mixed_key_layer1 = self.key1(input_tensor1) + mixed_value_layer1 = self.value1(input_tensor1) + + query_layer1 = self._transpose_for_scores(mixed_query_layer1) + key_layer1 = self._transpose_for_scores(mixed_key_layer1) + value_layer1 = self._transpose_for_scores(mixed_value_layer1) + + # for the second modality: + mixed_query_layer2 = self.query2(input_tensor2) + mixed_key_layer2 = self.key2(input_tensor2) + mixed_value_layer2 = self.value2(input_tensor2) + + query_layer2 = self._transpose_for_scores(mixed_query_layer2) + key_layer2 = self._transpose_for_scores(mixed_key_layer2) + value_layer2 = self._transpose_for_scores(mixed_value_layer2) + + # Conditioning the second modality on the first one. + attention_scores1 = self.attn1(query_layer2, key_layer1.transpose(-1, -2)) + if attention_mask1 is not None: + attention_scores1 = apply_mask(attention_scores1, attention_mask1) + if use_co_attention_mask: + attention_scores1 = apply_mask(attention_scores1, co_attention_mask.permute(0, 1, 3, 2)) + + attention_probs1 = torch.nn.Softmax(dim=-1)(attention_scores1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs1 = self.dropout1(attention_probs1) + + context_layer1 = torch.matmul(attention_probs1, value_layer1) + context_layer1 = context_layer1.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape1 = context_layer1.size()[:-2] + (self.all_head_size,) + context_layer1 = context_layer1.view(*new_context_layer_shape1) + + # Conditioning the first modality on the second one. + attention_scores2 = self.attn2(query_layer1, key_layer2.transpose(-1, -2)) + # we can comment this line for single flow. + if attention_mask2 is not None: + attention_scores2 = apply_mask(attention_scores2, attention_mask2) + if use_co_attention_mask: + attention_scores2 = apply_mask(attention_scores2, co_attention_mask) + + attention_probs2 = torch.nn.Softmax(dim=-1)(attention_scores2) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs2 = self.dropout2(attention_probs2) + + context_layer2 = torch.matmul(attention_probs2, value_layer2) + context_layer2 = context_layer2.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape2 = context_layer2.size()[:-2] + (self.all_head_size,) + context_layer2 = context_layer2.view(*new_context_layer_shape2) + + return context_layer1, context_layer2 diff --git a/allennlp/modules/transformer/bimodal_connection_layer.py b/allennlp/modules/transformer/bimodal_connection_layer.py new file mode 100644 index 00000000000..5d7e4f7fc88 --- /dev/null +++ b/allennlp/modules/transformer/bimodal_connection_layer.py @@ -0,0 +1,117 @@ +from allennlp.common import FromParams + +from allennlp.modules.transformer.activation_layer import ActivationLayer +from allennlp.modules.transformer.output_layer import OutputLayer +from allennlp.modules.transformer.bimodal_attention import BiModalAttention + +from allennlp.modules.transformer.transformer_module import TransformerModule + + +class BiModalOutput(TransformerModule, FromParams): + def __init__( + self, + hidden_size1: int, + hidden_size2: int, + combined_hidden_size: int, + dropout1: float, + dropout2: float, + ): + super().__init__() + + self.bert_output1 = OutputLayer(combined_hidden_size, hidden_size1, dropout1) + self.bert_output2 = OutputLayer(combined_hidden_size, hidden_size2, dropout2) + + def forward(self, hidden_states1, input_tensor1, hidden_states2, input_tensor2): + + hidden_states1 = self.bert_output1(hidden_states1, input_tensor1) + hidden_states2 = self.bert_output2(hidden_states2, input_tensor2) + + return hidden_states1, hidden_states2 + + +class BiModalConnectionLayer(TransformerModule, FromParams): + + _huggingface_mapping = {"biAttention": "bimodal_attention", "biOutput": "bimodal_output"} + + def __init__( + self, + hidden_size1: int, + hidden_size2: int, + combined_hidden_size: int, + intermediate_size1: int, + intermediate_size2: int, + num_attention_heads: int, + dropout1: float, + dropout2: float, + activation: str, + ): + super().__init__() + self.bimodal_attention = BiModalAttention( + hidden_size1=hidden_size1, + hidden_size2=hidden_size2, + combined_hidden_size=combined_hidden_size, + num_attention_heads=num_attention_heads, + dropout1=dropout1, + dropout2=dropout2, + ) + + self.bimodal_output = BiModalOutput( + hidden_size1=hidden_size1, + hidden_size2=hidden_size2, + combined_hidden_size=combined_hidden_size, + dropout1=dropout1, + dropout2=dropout2, + ) + + self.intermediate1 = ActivationLayer( + hidden_size=hidden_size1, + intermediate_size=intermediate_size1, + activation=activation, + ) + self.output1 = OutputLayer( + hidden_size=hidden_size1, + input_size=intermediate_size1, + dropout=dropout1, + ) + + self.intermediate2 = ActivationLayer( + hidden_size=hidden_size2, + intermediate_size=intermediate_size2, + activation=activation, + ) + self.output2 = OutputLayer( + hidden_size=hidden_size2, + input_size=intermediate_size2, + dropout=dropout2, + ) + + def forward( + self, + input_tensor1, + attention_mask1, + input_tensor2, + attention_mask2, + co_attention_mask=None, + use_co_attention_mask=False, + ): + + bi_output1, bi_output2 = self.bimodal_attention( + input_tensor1, + input_tensor2, + attention_mask1, + attention_mask2, + co_attention_mask, + use_co_attention_mask, + ) + + attention_output1, attention_output2 = self.bimodal_output( + bi_output2, input_tensor1, bi_output1, input_tensor2 + ) + + intermediate_output1 = self.intermediate1(attention_output1) + layer_output1 = 
self.output1(intermediate_output1, attention_output1) + + intermediate_output2 = self.intermediate2(attention_output2) + layer_output2 = self.output2(intermediate_output2, attention_output2) + + return layer_output1, layer_output2 diff --git a/allennlp/modules/transformer/bimodal_encoder.py b/allennlp/modules/transformer/bimodal_encoder.py new file mode 100644 index 00000000000..bf5e732e96d --- /dev/null +++ b/allennlp/modules/transformer/bimodal_encoder.py @@ -0,0 +1,335 @@ +from typing import Optional, Dict, List, Union +import torch + +from allennlp.common import FromParams + +from allennlp.modules.util import replicate_layers + +from allennlp.modules.transformer.transformer_layer import TransformerLayer +from allennlp.modules.transformer.bimodal_connection_layer import BiModalConnectionLayer +from allennlp.modules.transformer.transformer_module import TransformerModule + + +class BiModalEncoder(TransformerModule, FromParams): + """ + This module encodes two modalities separately, and performs bi-directional + attention using a connection layer. It is based on the modified BertEncoder in + the paper: [ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations + for Vision-and-Language Tasks](https://api.semanticscholar.org/CorpusID:199453025) + + # Parameters + + num_hidden_layers1: `int` (default = `12`) + Number of hidden layers in the transformer block for the first modality. + num_hidden_layers2: `int` (default = `12`) + Number of hidden layers in the transformer block for the second modality. + hidden_size1: `int` (default = `1024`) + hidden_size2: `int` (default = `1024`) + combined_hidden_size: `int` (default = `1024`) + Hidden size for the connection layer. + intermediate_size1: `int` (default = `1024`) + intermediate_size2: `int` (default = `1024`) + num_attention_heads1: `int` (default = `8`) + num_attention_heads2: `int` (default = `8`) + combined_num_attention_heads: `int` (default = `8`) + Number of attention heads in the connection layer. 
+ attention_dropout1: `float` (default = `0.1`) + hidden_dropout1: `float` (default = `0.1`) + attention_dropout2: `float` (default = `0.1`) + hidden_dropout2: `float` (default = `0.1`) + biattention_id1: `List`, optional (default = `[1]`) + biattention_id2: `List`, optional (default = `[1]`) + fixed_layer1: `int` (default = `0`) + fixed_layer2: `int` (default = `0`) + fast_mode: `bool` (default = `False`) + with_coattention: `bool` (default = `True`) + in_batch_pairs: `bool` (default = `False`) + """ + + _huggingface_mapping = {"layer": "layers1"} + _relevant_module = "encoder" + + def __init__( + self, + num_hidden_layers1: int = 12, + num_hidden_layers2: int = 12, + hidden_size1: int = 1024, + hidden_size2: int = 1024, + combined_hidden_size: int = 1024, + intermediate_size1: int = 1024, + intermediate_size2: int = 1024, + num_attention_heads1: int = 8, + num_attention_heads2: int = 8, + combined_num_attention_heads: int = 8, + attention_dropout1: float = 0.1, + hidden_dropout1: float = 0.1, + attention_dropout2: float = 0.1, + hidden_dropout2: float = 0.1, + activation: str = "relu", + biattention_id1: Optional[List[int]] = None, + biattention_id2: Optional[List[int]] = None, + fixed_layer1: int = 0, + fixed_layer2: int = 0, + fast_mode: bool = False, + with_coattention: bool = True, + in_batch_pairs: bool = False, + ): + super().__init__() + + self.FAST_MODE = fast_mode + self.with_coattention = with_coattention + self.biattention_id1 = biattention_id1 or [1] + self.biattention_id2 = biattention_id2 or [1] + self.in_batch_pairs = in_batch_pairs + self.fixed_layer1 = fixed_layer1 + self.fixed_layer2 = fixed_layer2 + self.combined_size = combined_hidden_size + self.hidden_size1 = hidden_size1 + self.hidden_size2 = hidden_size2 + + layer1 = TransformerLayer( + hidden_size=hidden_size1, + intermediate_size=intermediate_size1, + num_attention_heads=num_attention_heads1, + attention_dropout=attention_dropout1, + hidden_dropout=hidden_dropout1, + activation=activation, + ) + layer2 = TransformerLayer( + hidden_size=hidden_size2, + intermediate_size=intermediate_size2, + num_attention_heads=num_attention_heads2, + attention_dropout=attention_dropout2, + hidden_dropout=hidden_dropout2, + activation=activation, + ) + connect_layer = BiModalConnectionLayer( + hidden_size1=hidden_size1, + hidden_size2=hidden_size2, + combined_hidden_size=combined_hidden_size, + intermediate_size1=intermediate_size1, + intermediate_size2=intermediate_size2, + num_attention_heads=combined_num_attention_heads, + dropout1=hidden_dropout1, + dropout2=hidden_dropout2, + activation=activation, + ) + + self.layers1 = replicate_layers(layer1, num_hidden_layers1) + self.layers2 = replicate_layers(layer2, num_hidden_layers2) + self.c_layer = replicate_layers(connect_layer, len(self.biattention_id2)) + + def forward( + self, + embedding1, + embedding2, + attention_mask1, + attention_mask2, + co_attention_mask=None, + output_all_encoded_layers=True, + ): + start1 = 0 + start2 = 0 + count = 0 + all_encoder_layers1 = [] + all_encoder_layers2 = [] + + batch_size, num_words, hidden_size1 = embedding1.size() + _, num_regions, hidden_size2 = embedding2.size() + + use_co_attention_mask = False + for layer_id2, layer_id1 in zip(self.biattention_id2, self.biattention_id1): + end1 = layer_id1 + end2 = layer_id2 + + assert self.fixed_layer1 <= end1 + assert self.fixed_layer2 <= end2 + + for idx in range(start1, self.fixed_layer1): + with torch.no_grad(): + embedding1 = self.layers1[idx](embedding1, attention_mask1)[0] + start1 = 
self.fixed_layer1 + + for idx in range(start1, end1): + embedding1 = self.layers1[idx](embedding1, attention_mask1)[0] + + for idx in range(start2, self.fixed_layer2): + with torch.no_grad(): + embedding2 = self.layers2[idx](embedding2, attention_mask2)[0] + start2 = self.fixed_layer2 + + for idx in range(start2, end2): + embedding2 = self.layers2[idx](embedding2, attention_mask2)[0] + + if count == 0 and self.in_batch_pairs: + # new batch size is the batch_size ^2 + embedding2 = ( + embedding2.unsqueeze(0) + .expand(batch_size, batch_size, num_regions, hidden_size2) + .contiguous() + .view(batch_size * batch_size, num_regions, hidden_size2) + ) + attention_mask2 = ( + attention_mask2.unsqueeze(0) + .expand(batch_size, batch_size, 1, 1, num_regions) + .contiguous() + .view(batch_size * batch_size, 1, 1, num_regions) + ) + + embedding1 = ( + embedding1.unsqueeze(1) + .expand(batch_size, batch_size, num_words, hidden_size1) + .contiguous() + .view(batch_size * batch_size, num_words, hidden_size1) + ) + attention_mask1 = ( + attention_mask1.unsqueeze(1) + .expand(batch_size, batch_size, 1, 1, num_words) + .contiguous() + .view(batch_size * batch_size, 1, 1, num_words) + ) + co_attention_mask = ( + co_attention_mask.unsqueeze(1) + .expand(batch_size, batch_size, 1, num_regions, num_words) + .contiguous() + .view(batch_size * batch_size, 1, num_regions, num_words) + ) + + if count == 0 and self.FAST_MODE: + embedding1 = embedding1.expand( + embedding2.size(0), + embedding1.size(1), + embedding1.size(2), + ) + attention_mask1 = attention_mask1.expand( + embedding2.size(0), + attention_mask1.size(1), + attention_mask1.size(2), + attention_mask1.size(3), + ) + + if self.with_coattention: + embedding1, embedding2 = self.c_layer[count]( + embedding1, + attention_mask1, + embedding2, + attention_mask2, + co_attention_mask, + use_co_attention_mask, + ) + + start2 = end2 + start1 = end1 + count += 1 + + if output_all_encoded_layers: + all_encoder_layers1.append(embedding1) + all_encoder_layers2.append(embedding2) + + for idx in range(start2, len(self.layers2)): + embedding2 = self.layers2[idx](embedding2, attention_mask2)[0] + + for idx in range(start1, len(self.layers1)): + embedding1 = self.layers1[idx](embedding1, attention_mask1)[0] + + # add the end part to finish. + if not output_all_encoded_layers: + all_encoder_layers1.append(embedding1) + all_encoder_layers2.append(embedding2) + + return ( + torch.stack(all_encoder_layers1, dim=-1), + torch.stack(all_encoder_layers2, dim=-1), + ) + + @classmethod + def _get_input_arguments( + cls, + pretrained_module: torch.nn.Module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + **kwargs, + ): + """ + The `pretrained_module` only supplies one of the modalities. 
+ """ + submodules = cls._get_mapped_submodules(pretrained_module, source, mapping) + + final_kwargs = {} + + final_kwargs["num_hidden_layers1"] = len(submodules["layers1"]) + + final_kwargs["hidden_size1"] = submodules["layers1.0.attention.self.query"].in_features + final_kwargs["num_attention_heads1"] = submodules[ + "layers1.0.attention.self" + ].num_attention_heads + final_kwargs["attention_dropout1"] = submodules["layers1.0.attention.self.dropout"].p + final_kwargs["hidden_dropout1"] = submodules["layers1.0.attention.output.dropout"].p + final_kwargs["intermediate_size1"] = submodules["layers1.0.intermediate.dense"].out_features + final_kwargs["activation"] = submodules["layers1.0.intermediate"].intermediate_act_fn + + final_kwargs.update(**kwargs) + + return final_kwargs + + def _load_from_pretrained_module( + self, + pretrained_module: torch.nn.Module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + ignore_absent_parameters: Optional[List] = None, + ): + if source == "huggingface": + ignore_absent_parameters = ["layers2", "c_layer"] + super()._load_from_pretrained_module( + pretrained_module, source, mapping, ignore_absent_parameters + ) + + @classmethod + def from_pretrained_module( # type: ignore + cls, + pretrained_module: Union[str, torch.nn.Module], + num_hidden_layers2: int, + hidden_size2: int, + combined_hidden_size: int, + intermediate_size2: int, + num_attention_heads2: int, + combined_num_attention_heads: int, + attention_dropout2: float, + hidden_dropout2: float, + biattention_id1: List[int], + biattention_id2: List[int], + fixed_layer1: int, + fixed_layer2: int, + fast_mode: bool = False, + with_coattention: bool = True, + in_batch_pairs: bool = False, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + # **kwargs, + ): + """ + The `pretrained_module` only supplies one of the modalities. 
+ """ + pretrained_module = cls.get_relevant_module( + pretrained_module, source=source, mapping=mapping + ) + final_kwargs = {} + final_kwargs.update(cls._get_input_arguments(pretrained_module, source, mapping)) + final_kwargs["num_hidden_layers2"] = num_hidden_layers2 + final_kwargs["hidden_size2"] = hidden_size2 + final_kwargs["combined_hidden_size"] = combined_hidden_size + final_kwargs["intermediate_size2"] = intermediate_size2 + final_kwargs["num_attention_heads2"] = num_attention_heads2 + final_kwargs["combined_num_attention_heads"] = combined_num_attention_heads + final_kwargs["attention_dropout2"] = attention_dropout2 + final_kwargs["hidden_dropout2"] = hidden_dropout2 + final_kwargs["biattention_id1"] = biattention_id1 + final_kwargs["biattention_id2"] = biattention_id2 + final_kwargs["fixed_layer1"] = fixed_layer1 + final_kwargs["fixed_layer2"] = fixed_layer2 + final_kwargs["fast_mode"] = fast_mode + final_kwargs["with_coattention"] = with_coattention + final_kwargs["in_batch_pairs"] = in_batch_pairs + + return super().from_pretrained_module(pretrained_module, source, mapping, **final_kwargs) diff --git a/allennlp/modules/transformer/output_layer.py b/allennlp/modules/transformer/output_layer.py new file mode 100644 index 00000000000..03dd1f9d5df --- /dev/null +++ b/allennlp/modules/transformer/output_layer.py @@ -0,0 +1,22 @@ +import torch + +from allennlp.common import FromParams + +from allennlp.modules.transformer.transformer_module import TransformerModule + + +class OutputLayer(TransformerModule, FromParams): + + _huggingface_mapping = {"LayerNorm": "layer_norm"} + + def __init__(self, input_size: int, hidden_size: int, dropout: float): + super().__init__() + self.dense = torch.nn.Linear(input_size, hidden_size) + self.layer_norm = torch.nn.LayerNorm(hidden_size, eps=1e-12) + self.dropout = torch.nn.Dropout(dropout) + + def forward(self, hidden_states, input_tensor): + dense_output = self.dense(hidden_states) + dropout_output = self.dropout(dense_output) + output = self.layer_norm(dropout_output + input_tensor) + return output diff --git a/allennlp/modules/transformer/positional_encoding.py b/allennlp/modules/transformer/positional_encoding.py new file mode 100644 index 00000000000..1cf63b15c91 --- /dev/null +++ b/allennlp/modules/transformer/positional_encoding.py @@ -0,0 +1,70 @@ +import math +import torch + +from allennlp.common import FromParams +from allennlp.nn.util import get_range_vector, get_device_of + + +class SinusoidalPositionalEncoding(torch.nn.Module, FromParams): + """ + Implements the frequency-based positional encoding described + in [Attention is All you Need][0]. + + Adds sinusoids of different frequencies to a `Tensor`. A sinusoid of a + different frequency and phase is added to each dimension of the input `Tensor`. + This allows the attention heads to use absolute and relative positions. + + The number of timescales is equal to hidden_dim / 2 within the range + (min_timescale, max_timescale). For each timescale, the two sinusoidal + signals sin(timestep / timescale) and cos(timestep / timescale) are + generated and concatenated along the hidden_dim dimension. + + [0]: https://www.semanticscholar.org/paper/Attention-Is-All-You-Need-Vaswani-Shazeer/0737da0767d77606169cbf4187b83e1ab62f6077 + + # Parameters + + tensor : `torch.Tensor` + a Tensor with shape (batch_size, timesteps, hidden_dim). + min_timescale : `float`, optional (default = `1.0`) + The smallest timescale to use. 
+ max_timescale : `float`, optional (default = `1.0e4`) + The largest timescale to use. + + # Returns + + `torch.Tensor` + The input tensor augmented with the sinusoidal frequencies. + """ # noqa + + def __init__(self, min_timescale: float = 1.0, max_timescale: float = 1.0e4): + super().__init__() + self.min_timescale = min_timescale + self.max_timescale = max_timescale + + def forward(self, input_tensor: torch.Tensor): + # TODO: Another option is to specify the expected size in init, so that we can construct + # the positional encoding beforehand, and simply add it to the input tensor in forward. + _, timesteps, hidden_dim = input_tensor.size() + num_timescales = hidden_dim // 2 + device = get_device_of(input_tensor) + + timestep_range = get_range_vector(timesteps, device).data.float() + timescale_range = get_range_vector(num_timescales, device).data.float() + + log_timescale_increments = math.log( + float(self.max_timescale) / float(self.min_timescale) + ) / float(num_timescales - 1) + inverse_timescales = self.min_timescale * torch.exp( + timescale_range * -log_timescale_increments + ) + + # Broadcasted multiplication - shape (timesteps, num_timescales) + scaled_time = timestep_range.unsqueeze(1) * inverse_timescales.unsqueeze(0) + # shape (timesteps, 2 * num_timescales) + sinusoids = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 1) + if hidden_dim % 2 != 0: + # if the number of dimensions is odd, the cos and sin + # timescales had size (hidden_dim - 1) / 2, so we need + # to add a row of zeros to make up the difference. + sinusoids = torch.cat([sinusoids, sinusoids.new_zeros(timesteps, 1)], 1) + return input_tensor + sinusoids.unsqueeze(0) diff --git a/allennlp/modules/transformer/self_attention.py b/allennlp/modules/transformer/self_attention.py new file mode 100644 index 00000000000..6db6aba1fad --- /dev/null +++ b/allennlp/modules/transformer/self_attention.py @@ -0,0 +1,179 @@ +from typing import Optional, Dict +import torch + +from allennlp.common import FromParams +from allennlp.modules.attention import Attention +from allennlp.modules.transformer.transformer_module import TransformerModule +from allennlp.modules.transformer.util import apply_mask + + +class SelfAttention(TransformerModule, FromParams): + """ + This module computes the self-attention, similar to the architecture in BERT. Additionally, the attention + scoring function can be specified. + Details in the paper: + [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al, 2019] + (https://api.semanticscholar.org/CorpusID:52967399) + + # Parameters + + hidden_size: `int` + num_attention_heads: `int` + dropout: `float` (default = `0.0`) + scoring_func: `str` (default = `scaled_dot_product`) + The name of the attention-calculating function to be used. + Eg. `additive`, `linear`, etc. For a complete list, please check :mod:`allennlp.modules.attention`. 
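+
+    A minimal usage sketch (the sizes below are arbitrary, chosen only for illustration):
+
+    ```
+    attention = SelfAttention(hidden_size=16, num_attention_heads=4)
+    hidden_states = torch.randn(2, 7, 16)  # batch_size x seq_len x hidden_dim
+    (context,) = attention(hidden_states)  # a tuple; pass `output_attentions=True` to also get the probabilities
+    ```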
+ """ + + _relevant_module = ["encoder.layers.0.attention.self", "encoder.layers.0.attention"] + _huggingface_mapping = {"layer": "layers"} + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + dropout: float = 0.0, + scoring_func: str = "scaled_dot_product", + output_linear: bool = False, + ): + super().__init__() + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads) + ) + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.attention_head_size = int(hidden_size / num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = torch.nn.Linear(hidden_size, self.all_head_size) + self.key = torch.nn.Linear(hidden_size, self.all_head_size) + self.value = torch.nn.Linear(hidden_size, self.all_head_size) + + self.scoring_func = scoring_func + if self.scoring_func in ["additive", "linear", "bilinear"]: + self.attn = Attention.by_name(self.scoring_func)(hidden_size, hidden_size) + elif self.scoring_func == "scaled_dot_product": + self.attn = Attention.by_name(self.scoring_func)(self.attention_head_size, False) + else: + self.attn = Attention.by_name(self.scoring_func)() + + # out linear layer for distilbert. + if output_linear: + self.output = torch.nn.Linear(hidden_size, self.all_head_size) + + self.dropout = torch.nn.Dropout(dropout) + + def _transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + query_states: torch.Tensor, + key_states: Optional[torch.Tensor] = None, + value_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + """ + query_states : `torch.Tensor` + Shape `batch_size x seq_len x hidden_dim` + key_states : `torch.Tensor`, optional + Shape `batch_size x seq_len x hidden_dim` + value_states : `torch.Tensor`, optional + Shape `batch_size x seq_len x hidden_dim` + attention_mask : `torch.BoolTensor`, optional + Shape `batch_size x seq_len` + head_mask : `torch.BoolTensor`, optional + output_attentions : `bool` + Whether to also return the attention probabilities, default = `False` + """ + if key_states is None: + key_states = query_states + if value_states is None: + value_states = query_states + + mixed_query_layer = self.query(query_states) + mixed_key_layer = self.key(key_states) + mixed_value_layer = self.value(value_states) + + query_layer = self._transpose_for_scores(mixed_query_layer) + key_layer = self._transpose_for_scores(mixed_key_layer) + value_layer = self._transpose_for_scores(mixed_value_layer) + + attention_scores = self.attn(query_layer, key_layer.transpose(-1, -2)) + + if attention_mask is not None: + attention_scores = apply_mask(attention_scores, attention_mask) + + attention_probs = torch.nn.Softmax(dim=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + if hasattr(self, "output"): + context_layer = self.output(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + @classmethod + def _get_mapping( + cls, pretrained_module=None, source="huggingface", mapping: Optional[Dict[str, str]] = None + ): + combined_mapping = {} + if "huggingface" in source: + combined_mapping.update(cls._huggingface_mapping) + if mapping is not None: + combined_mapping.update(mapping) + if pretrained_module is not None: + for name, _ in pretrained_module.named_modules(): + if "q_lin" in name: + combined_mapping["q_lin"] = "query" + combined_mapping["k_lin"] = "key" + combined_mapping["v_lin"] = "value" + combined_mapping["out_lin"] = "output" + combined_mapping["transformer"] = "encoder" + break + return combined_mapping + + @classmethod + def _get_input_arguments( + cls, + pretrained_module: torch.nn.Module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + **kwargs, + ): + submodules = cls._get_mapped_submodules(pretrained_module, source, mapping) + final_kwargs = {} + + final_kwargs["hidden_size"] = submodules["query"].in_features + if hasattr(submodules[""], "num_attention_heads"): + final_kwargs["num_attention_heads"] = submodules[""].num_attention_heads + elif hasattr(submodules[""], "n_heads"): + final_kwargs["num_attention_heads"] = submodules[""].n_heads + final_kwargs["output_linear"] = True # Since this is the distilbert case. + else: + raise AttributeError("Cannot find a relevant attribute for number of heads.") + + final_kwargs["dropout"] = submodules["dropout"].p + + final_kwargs.update(**kwargs) + + return final_kwargs diff --git a/allennlp/modules/transformer/transformer_embeddings.py b/allennlp/modules/transformer/transformer_embeddings.py new file mode 100644 index 00000000000..df0e53c4544 --- /dev/null +++ b/allennlp/modules/transformer/transformer_embeddings.py @@ -0,0 +1,214 @@ +from typing import Optional, Dict + +import torch + +from allennlp.common import FromParams + +from allennlp.modules.transformer.transformer_module import TransformerModule + + +class Embeddings(TransformerModule, FromParams): + """ + General class for embeddings for any modality. + + # Parameters + + embeddings : `torch.nn.ModuleDict` + Named embedding layers. Eg. `"word_embeddings"`, `"position_embeddings"`, etc. + All the embedding layers are expected to have different inputs; the output + of one will not be passed to the other. All the layers should have the same + `embedding_dim`/`out_features`. + embedding_size : `int` + The `embedding_dim` of all the embedding layers. + dropout : `float` + The probability of an element to be zeroed. 
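+
+    As an illustrative sketch (the layer names and sizes below are arbitrary), two
+    embedding layers whose outputs are summed, layer-normed, and dropped out:
+
+    ```
+    embeddings = Embeddings(
+        torch.nn.ModuleDict(
+            {
+                "word_embeddings": torch.nn.Embedding(100, 10),
+                "position_embeddings": torch.nn.Embedding(512, 10),
+            }
+        ),
+        embedding_size=10,
+        dropout=0.1,
+    )
+    token_ids = torch.LongTensor([[1, 2, 3]])
+    position_ids = torch.LongTensor([[0, 1, 2]])
+    output = embeddings(token_ids, position_ids)  # Shape: (1, 3, 10)
+    ```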
+ """ + + def __init__(self, embeddings: torch.nn.ModuleDict, embedding_size: int, dropout: float): + super().__init__() + for name, embedding_layer in embeddings.named_children(): + if isinstance(embedding_layer, torch.nn.Embedding): + assert embedding_layer.embedding_dim == embedding_size + elif isinstance(embedding_layer, torch.nn.Linear): + assert embedding_layer.out_features == embedding_size + else: + raise TypeError( + 'Layer "{}" must be of type `torch.nn.Embedding` or `torch.nn.Linear`.'.format( + name + ) + ) + self.embeddings = embeddings + self.layer_norm = torch.nn.LayerNorm(embedding_size, eps=1e-12) + self.dropout = torch.nn.Dropout(dropout) + + def forward(self, *inputs) -> torch.Tensor: + assert len(inputs) == len(self.embeddings) + outputs = [] + for i, layer in enumerate(self.embeddings.children()): + outputs.append(layer(inputs[i])) + + outputs = sum(outputs) # type: ignore + outputs = self.layer_norm(outputs) + outputs = self.dropout(outputs) + return outputs + + +class ImageFeatureEmbeddings(Embeddings): + """ + Embedding module for image features. + + # Parameters + + feature_size : `int` + Number of image features. + embedding_size : `int` + The `embedding_dim` of all the embedding layers. + dropout : `float` (default = `0.0`) + The probability of an element to be zeroed. + """ + + def __init__(self, feature_size: int, embedding_size: int, dropout: float = 0.0): + image_embeddings = torch.nn.Linear(feature_size, embedding_size) + location_embeddings = torch.nn.Linear(4, embedding_size) + embeddings = torch.nn.ModuleDict( + {"image_embeddings": image_embeddings, "location_embeddings": location_embeddings} + ) + super().__init__(embeddings, embedding_size, dropout) + + +class TransformerEmbeddings(Embeddings): + """ + Construct the embeddings from word, position and token_type embeddings. + Details in the paper: + [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al, 2019] + (https://api.semanticscholar.org/CorpusID:52967399) + + # Parameters + + vocab_size : `int` + The size of the input vocab. + embedding_size : `int` + The `embedding_dim` of all the embedding layers. + pad_token_id : `int` (default = `0`) + The token id of the `` token. + max_position_embeddings : `int` (default = `512`) + The maximum number of positions. + type_vocab_size : `int` (default = `2`) + The size of the input token_type vocab. + dropout : `int` (default = `0.1`) + The probability of an element to be zeroed. + output_size : `int`, optional (default = `None`) + Optionally apply a linear transform after the dropout, projecting to `output_size`. 
+ """ + + _relevant_module = "embeddings" + _huggingface_mapping = { + "LayerNorm": "layer_norm", + "word_embeddings": "embeddings.word_embeddings", + "position_embeddings": "embeddings.position_embeddings", + "token_type_embeddings": "embeddings.token_type_embeddings", + } + + def __init__( + self, + vocab_size: int, + embedding_size: int, + pad_token_id: int = 0, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + dropout: float = 0.1, + output_size: Optional[int] = None, + ): + + embedding_dict = {} + + word_embeddings = torch.nn.Embedding(vocab_size, embedding_size, padding_idx=pad_token_id) + embedding_dict["word_embeddings"] = word_embeddings + + position_embeddings = torch.nn.Embedding(max_position_embeddings, embedding_size) + embedding_dict["position_embeddings"] = position_embeddings + + if type_vocab_size > 0: + token_type_embeddings = torch.nn.Embedding(type_vocab_size, embedding_size) + embedding_dict["token_type_embeddings"] = token_type_embeddings + + embeddings = torch.nn.ModuleDict(embedding_dict) + + super().__init__(embeddings, embedding_size, dropout) + + # For Albert, the embedding size is different than the hidden size used + # in the model, so a linear transform is applied. + if output_size: + self.linear_transform = torch.nn.Linear(embedding_size, output_size) + + def forward( # type: ignore + self, + input_ids: torch.Tensor, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + """ + input_ids : `torch.Tensor` + Shape `batch_size x seq_len` + token_type_ids : `torch.Tensor`, optional + Shape `batch_size x seq_len` + position_ids : `torch.Tensor`, optional + Shape `batch_size x seq_len` + """ + + input_shape = input_ids.size() + device = input_ids.device + seq_length = input_shape[1] + + embedding_inputs = [input_ids] + + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).expand(input_shape) + + embedding_inputs.append(position_ids) + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + if len(self.embeddings) == 3: + embedding_inputs.append(token_type_ids) + + embeddings = super().forward(*embedding_inputs) + + if hasattr(self, "linear_transform"): + embeddings = self.linear_transform(embeddings) + + return embeddings + + @classmethod + def _get_input_arguments( + cls, + pretrained_module: torch.nn.Module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + **kwargs, + ): + submodules = cls._get_mapped_submodules(pretrained_module, source, mapping) + + final_kwargs = {} + + final_kwargs["vocab_size"] = submodules["embeddings.word_embeddings"].num_embeddings + final_kwargs["embedding_size"] = submodules["embeddings.word_embeddings"].embedding_dim + final_kwargs["pad_token_id"] = submodules["embeddings.word_embeddings"].padding_idx + final_kwargs["max_position_embeddings"] = submodules[ + "embeddings.position_embeddings" + ].num_embeddings + + if "embeddings.token_type_embeddings" in submodules: + final_kwargs["type_vocab_size"] = submodules[ + "embeddings.token_type_embeddings" + ].num_embeddings + + else: + final_kwargs["type_vocab_size"] = 0 + + final_kwargs.update(**kwargs) + + return final_kwargs diff --git a/allennlp/modules/transformer/transformer_layer.py b/allennlp/modules/transformer/transformer_layer.py new file mode 100644 index 00000000000..3282b2dbf14 --- /dev/null +++ 
b/allennlp/modules/transformer/transformer_layer.py @@ -0,0 +1,249 @@ +from typing import Union, Optional, Dict + +import torch + +from allennlp.common import FromParams + +from allennlp.modules.transformer.transformer_module import TransformerModule + +from allennlp.modules.transformer.activation_layer import ActivationLayer +from allennlp.modules.transformer.self_attention import SelfAttention +from allennlp.modules.transformer.output_layer import OutputLayer + + +class AttentionLayer(TransformerModule, FromParams): + """ + This module wraps the self-attention with the output-layer, similar to the architecture in BERT. + Details in the paper: + [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al, 2019] + (https://api.semanticscholar.org/CorpusID:52967399) + + # Parameters + + hidden_size: `int` + num_attention_heads: `int` + attention_dropout: `float` (default = `0.0`) + Dropout probability for the `SelfAttention` layer. + hidden_dropout: `float` (default = `0.0`) + Dropout probability for the `OutputLayer`. + """ + + _relevant_module = "encoder.layers.0.attention" + _huggingface_mapping = {"layer": "layers"} + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + attention_dropout: float = 0.0, + hidden_dropout: float = 0.0, + ): + super().__init__() + self.self = SelfAttention(hidden_size, num_attention_heads, attention_dropout) + self.output = OutputLayer(hidden_size, hidden_size, hidden_dropout) + + def forward( + self, + input_tensor: torch.Tensor, + attention_mask: torch.BoolTensor, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.BoolTensor] = None, + output_attentions: bool = False, + ): + """ + input_tensor : `torch.Tensor` + Shape `batch_size x seq_len x hidden_dim` + attention_mask : `torch.BoolTensor`, optional + Shape `batch_size x seq_len` + head_mask : `torch.BoolTensor`, optional + output_attentions : `bool` + Whether to also return the attention probabilities, default = `False` + """ + + if encoder_hidden_states is not None: + attention_mask = encoder_attention_mask + + self_output = self.self( + input_tensor, + encoder_hidden_states, + encoder_hidden_states, + attention_mask, + head_mask, + output_attentions, + ) + attention_output = self.output(self_output[0], input_tensor) + outputs = (attention_output,) + self_output[1:] # add attentions if we output them + return outputs + + @classmethod + def _get_input_arguments( + cls, + pretrained_module: torch.nn.Module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + **kwargs, + ): + submodules = cls._get_mapped_submodules(pretrained_module, source, mapping) + + final_kwargs = {} + + final_kwargs["hidden_size"] = submodules["self.query"].in_features + final_kwargs["num_attention_heads"] = submodules["self"].num_attention_heads + final_kwargs["attention_dropout"] = submodules["self.dropout"].p + final_kwargs["hidden_dropout"] = submodules["output.dropout"].p + + final_kwargs.update(**kwargs) + + return final_kwargs + + +class TransformerLayer(TransformerModule, FromParams): + """ + This module is a single transformer layer, mapping to `BertLayer` in the architecture in BERT. 
+ Details in the paper: + [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al, 2019] + (https://api.semanticscholar.org/CorpusID:52967399) + + # Parameters + + hidden_size : `int` + intermediate_size : `int` + num_attention_heads : `int` + attention_dropout : `float` (default = `0.0`) + Dropout probability for the `SelfAttention` layer. + hidden_dropout : `float` (default = `0.0`) + Dropout probability for the `OutputLayer`. + activation : `Union[str, torch.nn.Module]` + add_cross_attention : `bool` (default = `False`) + If True, an extra `AttentionLayer` is added for cross-attention. + This is helpful when using the layer in a decoder. + """ + + _relevant_module = "encoder.layers.0" + _huggingface_mapping = { + "layer": "layers", + "intermediate_act_fn": "act_fn", + "crossattention": "cross_attention", + } + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + num_attention_heads: int, + attention_dropout: float = 0.0, + hidden_dropout: float = 0.0, + activation: Union[str, torch.nn.Module] = "relu", + add_cross_attention: bool = False, + ): + super().__init__() + + self._hidden_size = hidden_size + self._add_cross_attention = add_cross_attention + + self.attention = AttentionLayer( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + attention_dropout=attention_dropout, + hidden_dropout=hidden_dropout, + ) + + if add_cross_attention: + self.cross_attention = AttentionLayer( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + attention_dropout=attention_dropout, + hidden_dropout=hidden_dropout, + ) + + self.intermediate = ActivationLayer( + hidden_size=hidden_size, intermediate_size=intermediate_size, activation=activation + ) + self.output = OutputLayer( + input_size=intermediate_size, hidden_size=hidden_size, dropout=hidden_dropout + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + """ + hidden_states : `torch.Tensor` + Shape `batch_size x seq_len x hidden_dim` + attention_mask : `torch.BoolTensor`, optional + Shape `batch_size x seq_len` + head_mask : `torch.BoolTensor`, optional + encoder_hidden_states : `torch.Tensor`, optional + encoder_attention_mask : `torch.Tensor`, optional + output_attentions : `bool` + Whether to also return the attention probabilities, default = `False` + """ + attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + ) + attention_output = attention_outputs[0] + outputs = attention_outputs[1:] # add self attentions if we output attention weights + + if encoder_hidden_states is not None: + assert hasattr( + self, "cross_attention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated " + "with cross-attention layers by setting `config.add_cross_attention=True`" + + cross_attention_outputs = self.cross_attention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = ( + outputs + cross_attention_outputs[1:] + ) # add cross attentions if we output attention weights + + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + outputs = 
(layer_output,) + outputs + return outputs + + @classmethod + def _get_input_arguments( + cls, + pretrained_module: torch.nn.Module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + **kwargs, + ): + submodules = cls._get_mapped_submodules(pretrained_module, source, mapping) + + final_kwargs = {} + + final_kwargs["hidden_size"] = submodules["attention.self.query"].in_features + final_kwargs["num_attention_heads"] = submodules["attention.self"].num_attention_heads + final_kwargs["attention_dropout"] = submodules["attention.self.dropout"].p + final_kwargs["hidden_dropout"] = submodules["attention.output.dropout"].p + final_kwargs["intermediate_size"] = submodules["intermediate.dense"].out_features + + # We require the if block as `act_fn` is a function rather than a module, + # so `_get_mapped_submodules` does not automatically fix this. + if source == "huggingface": + final_kwargs["activation"] = getattr(submodules["intermediate"], "intermediate_act_fn") + else: + final_kwargs["activation"] = getattr(submodules["intermediate"], "act_fn") + + final_kwargs["add_cross_attention"] = "cross_attention" in submodules + + final_kwargs.update(**kwargs) + + return final_kwargs diff --git a/allennlp/modules/transformer/transformer_module.py b/allennlp/modules/transformer/transformer_module.py new file mode 100644 index 00000000000..11b650d84ec --- /dev/null +++ b/allennlp/modules/transformer/transformer_module.py @@ -0,0 +1,214 @@ +from typing import Optional, Dict, Union, List +import logging +import inspect + +import torch + +from allennlp.common import cached_transformers + +logger = logging.getLogger(__name__) + + +class TransformerModule(torch.nn.Module): + """ + Base class to help with generalized loading of pretrained weights. + + `_huggingface_mapping` is an optional mapping for each class, that determines + any differences in the module names between the class modules and the huggingface model's + modules. + + `_relevant_module` is an optional str or list of str which contains the expected name of the module + in the huggingface pretrained model. It can be a list to account for different names in different + models. The search is carried out in the order of the list. + """ + + _huggingface_mapping: Dict[str, str] = {} + _relevant_module: Optional[Union[str, List[str]]] = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @classmethod + def _get_mapping( + cls, + pretrained_module: Optional[torch.nn.Module] = None, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + ): + """ + Returns the mapping to be used, based on the optional `pretrained_module`. + If `pretrained_module` is not given, the default module-level mapping is returned. + """ + combined_mapping = {} + if "huggingface" in source: + combined_mapping.update(cls._huggingface_mapping) + if mapping is not None: + combined_mapping.update(mapping) + return combined_mapping + + @classmethod + def _get_mapped_submodules( + cls, pretrained_module, source="huggingface", mapping: Optional[Dict[str, str]] = None + ): + """ + Subclasses overload this method, and provide appropriate name mapping based on the source. 
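+
+        For example, with a subclass that defines `_huggingface_mapping = {"LayerNorm": "layer_norm"}`
+        (as `OutputLayer` does), a pretrained submodule named `LayerNorm` is returned
+        under the key `layer_norm`.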
+ """ + submodules = dict(pretrained_module.named_modules()) + combined_mapping = cls._get_mapping(pretrained_module, source, mapping) + for name, module in pretrained_module.named_modules(): + newname = name + for key, val in combined_mapping.items(): + newname = newname.replace(key, val) + submodules[newname] = submodules.pop(name) + return submodules + + def _construct_default_mapping( + self, + pretrained_module, + source: str = "huggingface", + mapping: Optional[Dict[str, str]] = None, + ): + """ + Recursively constructs the default mapping of parameter names for loading pretrained module weights. + Keys are parameter names from this module, and values are corresponding parameter names in the + expected pretrained module, as per `source`. + """ + combined_mapping = self._get_mapping(pretrained_module, source, mapping) + for name, module in self.named_modules(): + if name != "": + if hasattr(module, "_construct_default_mapping"): + # We handle collisions by giving priority to the outer module's mapping. + combined_mapping = dict( + list( + module._construct_default_mapping( + pretrained_module, source, combined_mapping + ).items() + ) + + list(combined_mapping.items()) + ) + return combined_mapping + + def _load_from_pretrained_module( + self, + pretrained_module: torch.nn.Module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + ignore_absent_parameters: Optional[List] = None, + ): + """ + Loads the weights of the `pretrained_module` into the instance. + Optionally, a `mapping` is specified for any differences in parameter names + between `pretrained_module` and the instance. + """ + ignore_absent_parameters = ignore_absent_parameters or [] + combined_mapping = self._construct_default_mapping(pretrained_module, source, mapping) + if mapping is not None: + combined_mapping.update(mapping) + + inverse_mapping = {val: key for key, val in combined_mapping.items()} + pretrained_parameters = dict(pretrained_module.named_parameters()) + for name, parameter in self.named_parameters(): + pretrained_name = name + for key, val in inverse_mapping.items(): + # so that we replace the names of submodules too. + # eg. module.key.anothermodule --> module.val.anothermodule + pretrained_name = pretrained_name.replace(key, val) + if not any( + [pretrained_name.startswith(paraname) for paraname in ignore_absent_parameters] + ): + if pretrained_name not in pretrained_parameters: + raise ValueError( + f"Couldn't find a matching parameter for {name}. Is this module " + "compatible with the pretrained module you're using?" + ) + parameter.data.copy_(pretrained_parameters[pretrained_name].data) + + @classmethod + def _get_input_arguments( + cls, + pretrained_module: torch.nn.Module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + **kwargs, + ): + """ + Constructs the arguments required for instantiating an object of this class, using + the values from `pretrained_module`. + """ + return kwargs + + @classmethod + def get_relevant_module( + cls, + pretrained_module: Union[str, torch.nn.Module], + relevant_module: Optional[Union[str, List[str]]] = None, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + ): + """ + Returns the relevant underlying module given a model name/object. + + # Parameters: + + pretrained_module: Name of the transformer model containing the layer, + or the actual layer (not the model object). + relevant_module: Name of the desired module. Defaults to cls._relevant_module. + source: Where the model came from. Default - huggingface. 
+ mapping: Optional mapping that determines any differences in the module names + between the class modules and the input model's modules. Default - cls._huggingface_mapping + """ + if isinstance(pretrained_module, str): + pretrained_module = cached_transformers.get(pretrained_module, False) + + relevant_module = relevant_module or cls._relevant_module + + if relevant_module is not None: + submodules = cls._get_mapped_submodules(pretrained_module, source, mapping) + # If the relevant_module is not found, we assume that the pretrained_module + # is already the relevant module. + if isinstance(relevant_module, str): + relevant_module = [relevant_module] + found = False + for module in relevant_module: + if module in submodules: + pretrained_module = submodules[module] + found = True + break + + if not found: + logger.warning( + "{} was not found! The submodules are: {}".format( + relevant_module, submodules.keys() + ) + ) + return pretrained_module + + @classmethod + def from_pretrained_module( + cls, + pretrained_module: Union[str, torch.nn.Module], + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + **kwargs, + ): + """ + Creates and returns an instance of the class, by using the weights + (and the architecture, by default) of the `pretrained_module`. + Optionally, the architecture can be changed by providing arguments. + """ + accepted_args = inspect.getfullargspec(cls).args + accepted_args.remove("self") + for key in kwargs: + assert key in accepted_args, ( + "{} is not a valid argument for creating an instance of `{}`. " + "Accepted arguments are {}.".format(key, cls.__name__, accepted_args) + ) + + pretrained_module = cls.get_relevant_module( + pretrained_module, source=source, mapping=mapping + ) + final_kwargs = cls._get_input_arguments(pretrained_module, source, mapping) + final_kwargs.update(kwargs) + module = cls(**final_kwargs) + module._load_from_pretrained_module(pretrained_module, source, mapping) + return module diff --git a/allennlp/modules/transformer/transformer_pooler.py b/allennlp/modules/transformer/transformer_pooler.py new file mode 100644 index 00000000000..198f3b56bc2 --- /dev/null +++ b/allennlp/modules/transformer/transformer_pooler.py @@ -0,0 +1,11 @@ +from allennlp.common import FromParams +from allennlp.modules.transformer.activation_layer import ActivationLayer + + +class TransformerPooler(ActivationLayer, FromParams): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + ): + super().__init__(hidden_size, intermediate_size, "relu", pool=True) diff --git a/allennlp/modules/transformer/transformer_stack.py b/allennlp/modules/transformer/transformer_stack.py new file mode 100644 index 00000000000..edeefc27ba9 --- /dev/null +++ b/allennlp/modules/transformer/transformer_stack.py @@ -0,0 +1,188 @@ +from typing import Union, Optional, Dict +import logging + +import torch + +from allennlp.common import FromParams + +from allennlp.modules.util import replicate_layers +from allennlp.modules.transformer.transformer_layer import TransformerLayer +from allennlp.modules.transformer.transformer_module import TransformerModule + +logger = logging.getLogger(__name__) + + +class TransformerStack(TransformerModule, FromParams): + """ + This module is the basic transformer stack. 
+    Details in the paper:
+    [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, Devlin et al, 2019]
+    (https://api.semanticscholar.org/CorpusID:52967399)
+
+    # Parameters
+
+    num_hidden_layers : `int`
+    layer : `TransformerLayer`, optional
+    hidden_size : `int`, optional
+        This needs to be provided if no `layer` argument is passed.
+    intermediate_size : `int`, optional
+        This needs to be provided if no `layer` argument is passed.
+    num_attention_heads : `int` (default = `8`)
+    attention_dropout : `float` (default = `0.1`)
+        Dropout probability for the `SelfAttention` layer.
+    hidden_dropout : `float` (default = `0.1`)
+        Dropout probability for the `OutputLayer`.
+    activation : `Union[str, torch.nn.Module]` (default = `"relu"`)
+    add_cross_attention : `bool` (default = `False`)
+        If True, the `TransformerLayer` modules will have cross attention modules as well.
+        This is helpful when using the `TransformerStack` as a decoder.
+    """
+
+    _huggingface_mapping = {"layer": "layers"}
+    _relevant_module = "encoder"
+
+    def __init__(
+        self,
+        num_hidden_layers: int,
+        layer: Optional[TransformerLayer] = None,
+        hidden_size: Optional[int] = None,
+        intermediate_size: Optional[int] = None,
+        num_attention_heads: int = 8,
+        attention_dropout: float = 0.1,
+        hidden_dropout: float = 0.1,
+        activation: Union[str, torch.nn.Module] = "relu",
+        add_cross_attention: bool = False,
+    ):
+        super().__init__()
+
+        self._add_cross_attention = add_cross_attention
+
+        if layer is not None:
+            logger.warning(
+                "The `layer` argument has been specified. Any other arguments will be ignored."
+            )
+        else:
+            assert (hidden_size is not None) and (intermediate_size is not None), (
+                "As the `layer` has not been provided, `hidden_size` and "
+                "`intermediate_size` are required to create `TransformerLayer`s."
+            )
+
+        layer = layer or TransformerLayer(
+            hidden_size,  # type: ignore
+            intermediate_size,  # type: ignore
+            num_attention_heads,
+            attention_dropout,
+            hidden_dropout,
+            activation,
+            add_cross_attention,
+        )
+        self.layers = replicate_layers(layer, num_hidden_layers)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+    ):
+        """
+        hidden_states : `torch.Tensor`
+            Shape `batch_size x seq_len x hidden_dim`
+        attention_mask : `torch.BoolTensor`, optional
+            Shape `batch_size x seq_len`
+        head_mask : `torch.BoolTensor`, optional
+        output_attentions : `bool`
+            Whether to also return the attention probabilities, default = `False`
+        output_hidden_states : `bool`
+            Whether to return the hidden_states for all layers, default = `False`
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self._add_cross_attention else None
+        for i, layer_module in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)  # type: ignore
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)  # type: ignore
+                if self._add_cross_attention:
+                    
all_cross_attentions = all_cross_attentions + (layer_outputs[2],) # type: ignore + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) # type: ignore + + return tuple( + v + for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] + if v is not None + ) + + @classmethod + def _get_input_arguments( + cls, + pretrained_module: torch.nn.Module, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + **kwargs, + ): + submodules = cls._get_mapped_submodules(pretrained_module, source, mapping) + + final_kwargs = {} + + final_kwargs["num_hidden_layers"] = len(submodules["layers"]) + + final_kwargs["hidden_size"] = submodules["layers.0.attention.self.query"].in_features + final_kwargs["num_attention_heads"] = submodules[ + "layers.0.attention.self" + ].num_attention_heads + final_kwargs["attention_dropout"] = submodules["layers.0.attention.self.dropout"].p + final_kwargs["hidden_dropout"] = submodules["layers.0.attention.output.dropout"].p + final_kwargs["intermediate_size"] = submodules["layers.0.intermediate.dense"].out_features + + # We require the if block as `act_fn` is a function rather than a module, + # so `_get_mapped_submodules` does not automatically fix this. + if source == "huggingface": + final_kwargs["activation"] = getattr( + submodules["layers.0.intermediate"], "intermediate_act_fn" + ) + else: + final_kwargs["activation"] = getattr(submodules["layers.0.intermediate"], "act_fn") + + final_kwargs["add_cross_attention"] = "layers.0.cross_attention" in submodules + + final_kwargs.update(**kwargs) + + return final_kwargs + + @classmethod + def from_pretrained_module( # type: ignore + cls, + pretrained_module: Union[str, torch.nn.Module], + num_hidden_layers: Optional[Union[int, range]] = None, + source="huggingface", + mapping: Optional[Dict[str, str]] = None, + **kwargs, + ): + final_kwargs = {} + if num_hidden_layers is not None: + if isinstance(num_hidden_layers, range): + if mapping is None: + mapping = {} + for num_layer, mapped in enumerate(num_hidden_layers): + mapping[str(mapped)] = str(num_layer) + final_kwargs["num_hidden_layers"] = len(num_hidden_layers) + else: + final_kwargs["num_hidden_layers"] = num_hidden_layers + + return super().from_pretrained_module(pretrained_module, source, mapping, **final_kwargs) diff --git a/allennlp/modules/transformer/util.py b/allennlp/modules/transformer/util.py new file mode 100644 index 00000000000..33dfcf77859 --- /dev/null +++ b/allennlp/modules/transformer/util.py @@ -0,0 +1,25 @@ +from typing import Union +import torch + + +def apply_mask( + values: torch.FloatTensor, mask: Union[torch.BoolTensor, torch.IntTensor, torch.FloatTensor] +) -> torch.FloatTensor: + """ + # Parameters + + values : `torch.FloatTensor` + Shape `batch_size x num_attention_heads x source_seq_len x target_seq_len` + mask : `torch.BoolTensor` + Shape `batch_size x target_seq_len` OR `batch_size x 1 x 1 x target_seq_len` + """ + if len(mask.shape) == 2: + # We create a 4D attention mask from a 2D tensor mask. + # The shape is `batch_size x 1 x 1 x target_seq_len` which is broadcast + # to `batch_size x num_attention_heads x source_seq_len x target_seq_len` + mask = mask.unsqueeze(1).unsqueeze(2) + # `mask==1` to convert float tensors. + mask = ( + ~(mask == 1) + ) * -10e5 # -10e5 to ensure that the model also works in half-precision mode. 
+ return values + mask diff --git a/allennlp/modules/util.py b/allennlp/modules/util.py new file mode 100644 index 00000000000..0c8c887c569 --- /dev/null +++ b/allennlp/modules/util.py @@ -0,0 +1,14 @@ +from copy import deepcopy +import torch + + +def replicate_layers(layer: torch.nn.Module, num_copies: int): + """ + # Parameters + layer (torch.nn.Module) - The torch layer that needs to be replicated. + num_copies (int) - Number of copies to create. + + # Returns + A ModuleList that contains `num_copies` of the `layer`. + """ + return torch.nn.ModuleList([deepcopy(layer) for _ in range(num_copies)]) diff --git a/allennlp/modules/vision/__init__.py b/allennlp/modules/vision/__init__.py new file mode 100644 index 00000000000..6fbebe2bcb9 --- /dev/null +++ b/allennlp/modules/vision/__init__.py @@ -0,0 +1,9 @@ +from allennlp.modules.vision.grid_embedder import GridEmbedder, ResnetBackbone +from allennlp.modules.vision.image2image import ( + Image2ImageModule, + NormalizeImage, +) +from allennlp.modules.vision.region_detector import ( + RegionDetector, + FasterRcnnRegionDetector, +) diff --git a/allennlp/modules/vision/grid_embedder.py b/allennlp/modules/vision/grid_embedder.py new file mode 100644 index 00000000000..61fe929ead6 --- /dev/null +++ b/allennlp/modules/vision/grid_embedder.py @@ -0,0 +1,69 @@ +from collections import OrderedDict +from typing import Tuple + +from torch import nn, FloatTensor, IntTensor +import torchvision + +from allennlp.common.registrable import Registrable + + +class GridEmbedder(nn.Module, Registrable): + """ + A `GridEmbedder` takes a batch of images as a tensor with shape + `(batch_size, color_channels, height, width)`, and returns an ordered dictionary + of tensors with shape `(batch_size, *)`, each representing a specific feature. + """ + + def forward(self, images: FloatTensor, sizes: IntTensor) -> "OrderedDict[str, FloatTensor]": + raise NotImplementedError() + + def get_feature_names(self) -> Tuple[str, ...]: + """ + Returns the feature names, in order, i.e. the keys of the ordered output + dictionary from `.forward()`. + """ + raise NotImplementedError() + + +@GridEmbedder.register("null") +class NullGridEmbedder(GridEmbedder): + """ + A `GridEmbedder` that returns the input image as given. + """ + + def forward(self, images: FloatTensor, sizes: IntTensor) -> "OrderedDict[str, FloatTensor]": + out = OrderedDict() + out["0"] = images + return out + + def get_feature_names(self) -> Tuple[str, ...]: + return ("0",) + + +@GridEmbedder.register("resnet_backbone") +class ResnetBackbone(GridEmbedder): + """ + Runs an image through [ResNet](https://api.semanticscholar.org/CorpusID:206594692), + as implemented by [torchvision](https://pytorch.org/docs/stable/torchvision/models.html). + """ + + def __init__(self) -> None: + super().__init__() + detection_model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True) + self.backbone = detection_model.backbone + # Don't need the rest of this. 
+ del detection_model + self.feature_names = tuple( + [ + self.backbone.body.return_layers[key] + for key in self.backbone.body.keys() + if key in self.backbone.body.return_layers + ] + + ["pool"] + ) + + def forward(self, images: FloatTensor, sizes: IntTensor) -> "OrderedDict[str, FloatTensor]": + return self.backbone(images) + + def get_feature_names(self) -> Tuple[str, ...]: + return self.feature_names diff --git a/allennlp/modules/vision/image2image.py b/allennlp/modules/vision/image2image.py new file mode 100644 index 00000000000..7b6e0ca9a2d --- /dev/null +++ b/allennlp/modules/vision/image2image.py @@ -0,0 +1,39 @@ +import torch +from torch import nn, FloatTensor, IntTensor +from typing import List + +from allennlp.common.registrable import Registrable + + +class Image2ImageModule(nn.Module, Registrable): + """ + An `Image2ImageModule` takes a batch of images as a tensor with the dimensions + `(batch_size, color_channels, height, width)`, and returns a tensor in the same format, + after applying some transformation on the images. + """ + + def forward(self, images: FloatTensor, sizes: IntTensor): + raise NotImplementedError() + + +@Image2ImageModule.register("normalize") +class NormalizeImage(Image2ImageModule): + """ + Normalizes an image by subtracting the mean and dividing by the + standard deviation, separately for each channel. + """ + + def __init__(self, means: List[float], stds: List[float]): + super().__init__() + assert len(means) == len(stds) + self.means = torch.tensor(means, dtype=torch.float32) + self.stds = torch.tensor(stds, dtype=torch.float32) + + def forward(self, images: FloatTensor, sizes: IntTensor): + assert images.size(1) == self.means.size(0) + self.means = self.means.to(images.device) + self.stds = self.stds.to(images.device) + images = images.transpose(1, -1) + images = images - self.means + images = images / self.stds + return images.transpose(-1, 1) diff --git a/allennlp/modules/vision/region_detector.py b/allennlp/modules/vision/region_detector.py new file mode 100644 index 00000000000..8ad6560932b --- /dev/null +++ b/allennlp/modules/vision/region_detector.py @@ -0,0 +1,328 @@ +import itertools +import random +from collections import OrderedDict +from typing import NamedTuple, Optional, List, Tuple + +import torch +from torch import nn, FloatTensor, IntTensor, Tensor +import torch.nn.functional as F +import torchvision +import torchvision.ops.boxes as box_ops + +from allennlp.common import Registrable + + +class RegionDetectorOutput(NamedTuple): + """ + The output type from the forward pass of a `RegionDetector`. + """ + + features: List[Tensor] + """ + A list of tensors, each with shape `(num_boxes, feature_dim)`. + """ + + boxes: List[Tensor] + """ + A list of tensors containing the coordinates for each box. Each has shape `(num_boxes, 4)`. + """ + + class_probs: Optional[List[Tensor]] = None + """ + An optional list of tensors. These tensors can have shape `(num_boxes,)` or + `(num_boxes, *)` if probabilities for multiple classes are given. + """ + + class_labels: Optional[List[Tensor]] = None + """ + An optional list of tensors that give the labels corresponding to the `class_probs` + tensors. This should be non-`None` whenever `class_probs` is, and each tensor + should have the same shape as the corresponding tensor from `class_probs`. 
+ """ + + +class RegionDetector(nn.Module, Registrable): + """ + A `RegionDetector` takes a batch of images, their sizes, and an ordered dictionary + of image features as input, and finds regions of interest (or "boxes") within those images. + + Those regions of interest are described by three values: + + - `features` (`List[Tensor]`): A feature vector for each region, which is a tensor of shape + `(num_boxes, feature_dim)`. + - `boxes` (`List[Tensor]`): The coordinates of each region within the original image, with shape + `(num_boxes, 4)`. + - `class_probs` (`Optional[List[Tensor]]`): Class probabilities from some object + detector that was used to find the regions of interest, with shape `(num_boxes,)` + or `(num_boxes, *)` if probabilities for more than one class are given. + - `class_labels` (`Optional[List[Tensor]]`): The labels corresponding to `class_probs`. + Each tensor in this list has the same shape as the corresponding tensor in `class_probs`. + + """ + + def forward( + self, + images: FloatTensor, + sizes: IntTensor, + image_features: "OrderedDict[str, FloatTensor]", + ) -> RegionDetectorOutput: + raise NotImplementedError() + + +@RegionDetector.register("random") +class RandomRegionDetector(RegionDetector): + """ + A `RegionDetector` that returns two proposals per image, for testing purposes. The features for + the proposal are a random 10-dimensional vector, and the coordinates are the size of the image. + """ + + def __init__(self, seed: Optional[int] = None): + super().__init__() + self.random = random.Random(seed) + + def _seeded_random_tensor(self, *shape: int, device) -> torch.FloatTensor: + """PyTorch's random functions can't take a random seed. There is only one global + random seed in torch, but that's not deterministic enough for us. So we use Python's + random source to make random tensors.""" + result = torch.zeros(*shape, dtype=torch.float32, device=device) + for coordinates in itertools.product(*(range(size) for size in result.shape)): + result[coordinates] = self.random.uniform(-1, 1) + return result + + def forward( + self, + images: FloatTensor, + sizes: IntTensor, + image_features: "OrderedDict[str, FloatTensor]", + ) -> RegionDetectorOutput: + batch_size, num_features, height, width = images.size() + features = [ + self._seeded_random_tensor(2, 10, device=images.device) for _ in range(batch_size) + ] + boxes = [ + torch.zeros(2, 4, dtype=torch.float32, device=images.device) for _ in range(batch_size) + ] + for image_num in range(batch_size): + boxes[image_num][0, 2] = sizes[image_num, 0] + boxes[image_num][0, 3] = sizes[image_num, 1] + boxes[image_num][1, 2] = sizes[image_num, 0] + boxes[image_num][1, 3] = sizes[image_num, 1] + return RegionDetectorOutput(features, boxes) + + +@RegionDetector.register("faster_rcnn") +class FasterRcnnRegionDetector(RegionDetector): + """ + A [Faster R-CNN](https://arxiv.org/abs/1506.01497) pretrained region detector. + + Unless you really know what you're doing, this should be used with the image + features created from the `ResnetBackbone` `GridEmbedder` and on images loaded + using the `TorchImageLoader` with the default settings. + + + !!! Note + This module does not have any trainable parameters by default. + All pretrained weights are frozen. + + # Parameters + + box_score_thresh : `float`, optional (default = `0.05`) + During inference, only proposal boxes / regions with a label classification score + greater than `box_score_thresh` will be returned. 
+
+    box_nms_thresh : `float`, optional (default = `0.5`)
+        During inference, non-maximum suppression (NMS) will be applied to groups of boxes
+        that share a common label.
+
+        NMS iteratively removes lower scoring boxes which have an intersection-over-union (IoU)
+        greater than `box_nms_thresh` with another higher scoring box.
+
+    max_boxes_per_image : `int`, optional (default = `100`)
+        During inference, at most `max_boxes_per_image` boxes will be returned. The
+        number of boxes returned will vary by image and will often be lower
+        than `max_boxes_per_image` depending on the values of `box_score_thresh`
+        and `box_nms_thresh`.
+    """
+
+    def __init__(
+        self,
+        *,
+        box_score_thresh: float = 0.05,
+        box_nms_thresh: float = 0.5,
+        max_boxes_per_image: int = 100,
+    ):
+        super().__init__()
+        self.detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(
+            pretrained=True,
+            box_score_thresh=box_score_thresh,
+            box_nms_thresh=box_nms_thresh,
+            box_detections_per_img=max_boxes_per_image,
+        )
+        # Don't need this since the features will be calculated elsewhere.
+        del self.detector.backbone
+        # Freeze all weights.
+        for parameter in self.detector.parameters():
+            parameter.requires_grad = False
+
+    def forward(
+        self,
+        images: FloatTensor,
+        sizes: IntTensor,
+        image_features: "OrderedDict[str, FloatTensor]",
+    ) -> RegionDetectorOutput:
+        """
+        Extract regions and region features from the given images.
+
+        In most cases `image_features` should come directly from the `ResnetBackbone`
+        `GridEmbedder`. The `images` themselves should be standardized and resized
+        using the default settings for the `TorchImageLoader`.
+        """
+        if self.training:
+            raise RuntimeError(
+                "FasterRcnnRegionDetector can not be used for training at the moment"
+            )
+
+        # Adapted from https://github.com/pytorch/vision/blob/
+        # 4521f6d152875974e317fa247a633e9ad1ea05c8/torchvision/models/detection/generalized_rcnn.py#L45
+        # We re-implement essentially the same forward eval pass except that we
+        # skip calling the backbone since we already have the `image_features`,
+        # and we also unpack the call to `roi_heads` so that we can keep the `box_features`
+        # that are created here:
+        # https://github.com/pytorch/vision/blob/
+        # 4521f6d152875974e317fa247a633e9ad1ea05c8/torchvision/models/detection/roi_heads.py#L752-L753
+
+        image_shapes: List[Tuple[int, int]] = list((int(h), int(w)) for (h, w) in sizes)
+        image_list = torchvision.models.detection.image_list.ImageList(images, image_shapes)
+
+        # `proposals` is a list of tensors, one tensor per image, each representing a
+        # fixed number of proposed regions/boxes.
+        # shape (proposals[i]): (proposals_per_image, 4)
+        proposals: List[Tensor]
+        proposals, _ = self.detector.rpn(image_list, image_features)
+
+        # shape: (batch_size * proposals_per_image, *)
+        box_features = self.detector.roi_heads.box_roi_pool(image_features, proposals, image_shapes)
+
+        # shape: (batch_size * proposals_per_image, *)
+        box_features = self.detector.roi_heads.box_head(box_features)
+
+        # shape (class_logits): (batch_size * proposals_per_image, num_classes)
+        # shape (box_regression): (batch_size * proposals_per_image, regression_output_size)
+        class_logits, box_regression = self.detector.roi_heads.box_predictor(box_features)
+
+        # This step filters down the `proposals` to only detections that reach
+        # a certain threshold.
+        # Each of these is a list of tensors, one for each image in the batch.
+ # shape (boxes[i]): (num_predicted_boxes, 4) + # shape (features[i]): (num_predicted_boxes, feature_size) + # shape (scores[i]): (num_predicted_classes,) + # shape (labels[i]): (num_predicted_classes,) + boxes, features, scores, labels = self._postprocess_detections( + class_logits, box_features, box_regression, proposals, image_shapes + ) + + return RegionDetectorOutput(features, boxes, scores, labels) + + def _postprocess_detections( + self, + class_logits: Tensor, + box_features: Tensor, + box_regression: Tensor, + proposals: List[Tensor], + image_shapes: List[Tuple[int, int]], + ) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]: + """ + Adapted from https://github.com/pytorch/vision/blob/ + 4521f6d152875974e317fa247a633e9ad1ea05c8/torchvision/models/detection/roi_heads.py#L664. + + The only reason we have to re-implement this method is so we can pull out the box + features that we want. + """ + device = class_logits.device + num_classes = class_logits.shape[-1] + + boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals] + + # shape: (batch_size * boxes_per_image, num_classes, 4) + pred_boxes = self.detector.roi_heads.box_coder.decode(box_regression, proposals) + + pred_scores = F.softmax(class_logits, -1) + + pred_boxes_list = pred_boxes.split(boxes_per_image, 0) + features_list = box_features.split(boxes_per_image, dim=0) + pred_scores_list = pred_scores.split(boxes_per_image, 0) + + all_boxes = [] + all_features = [] + all_scores = [] + all_labels = [] + for boxes, features, scores, image_shape in zip( + pred_boxes_list, features_list, pred_scores_list, image_shapes + ): + # shape: (boxes_per_image, num_classes, 4) + boxes = box_ops.clip_boxes_to_image(boxes, image_shape) + + # shape: (boxes_per_image, num_classes, feature_size) + features = features.unsqueeze(1).expand(boxes.shape[0], boxes.shape[1], -1) + + # create labels for each prediction + # shape: (num_classes,) + labels = torch.arange(num_classes, device=device) + # shape: (boxes_per_image, num_classes,) + labels = labels.view(1, -1).expand_as(scores) + + # remove predictions with the background label + # shape: (boxes_per_image, num_classes - 1, 4) + boxes = boxes[:, 1:] + # shape: (boxes_per_image, num_classes, feature_size) + features = features[:, 1:] + # shape: (boxes_per_image, num_classes - 1,) + scores = scores[:, 1:] + # shape: (boxes_per_image, num_classes - 1,) + labels = labels[:, 1:] + + # batch everything, by making every class prediction be a separate instance + # shape: (boxes_per_image * (num_classes - 1), 4) + boxes = boxes.reshape(-1, 4) + # shape: (boxes_per_image * (num_classes - 1), feature_size) + features = features.reshape(boxes.shape[0], -1) + # shape: (boxes_per_image * (num_classes - 1),) + scores = scores.reshape(-1) + # shape: (boxes_per_image * (num_classes - 1),) + labels = labels.reshape(-1) + + # remove low scoring boxes + inds = torch.where(scores > self.detector.roi_heads.score_thresh)[0] + boxes, features, scores, labels = ( + boxes[inds], + features[inds], + scores[inds], + labels[inds], + ) + + # remove empty boxes + keep = box_ops.remove_small_boxes(boxes, min_size=1e-2) + boxes, features, scores, labels = ( + boxes[keep], + features[keep], + scores[keep], + labels[keep], + ) + + # non-maximum suppression, independently done per class + keep = box_ops.batched_nms(boxes, scores, labels, self.detector.roi_heads.nms_thresh) + # keep only topk scoring predictions + keep = keep[: self.detector.roi_heads.detections_per_img] + boxes, features, scores, 
labels = ( + boxes[keep], + features[keep], + scores[keep], + labels[keep], + ) + + all_boxes.append(boxes) + all_features.append(features) + all_scores.append(scores) + all_labels.append(labels) + + return all_boxes, all_features, all_scores, all_labels diff --git a/allennlp/nn/activations.py b/allennlp/nn/activations.py index 0d9f9925cc5..d2d329180db 100644 --- a/allennlp/nn/activations.py +++ b/allennlp/nn/activations.py @@ -86,6 +86,7 @@ def _get_name(self): "relu": (torch.nn.ReLU, None), "relu6": (torch.nn.ReLU6, None), "elu": (torch.nn.ELU, None), + "gelu": (torch.nn.GELU, None), "prelu": (torch.nn.PReLU, None), "leaky_relu": (torch.nn.LeakyReLU, None), "threshold": (torch.nn.Threshold, None), @@ -98,5 +99,4 @@ def _get_name(self): "softsign": (torch.nn.Softsign, None), "tanhshrink": (torch.nn.Tanhshrink, None), "selu": (torch.nn.SELU, None), - "gelu": (torch.nn.GELU, None), } diff --git a/allennlp/nn/util.py b/allennlp/nn/util.py index 55831de5a54..9cc1c313156 100644 --- a/allennlp/nn/util.py +++ b/allennlp/nn/util.py @@ -22,41 +22,33 @@ T = TypeVar("T") -def has_tensor(obj) -> bool: +def move_to_device(obj, device: Union[torch.device, int]): """ - Given a possibly complex data structure, - check if it has any torch.Tensors in it. + Given a structure (possibly) containing Tensors, + move all the Tensors to the specified device (or do nothing, if they are already on + the target device). """ + device = int_to_device(device) + if isinstance(obj, torch.Tensor): - return True + # You may be wondering why we don't just always call `obj.to(device)` since that would + # be a no-op anyway if `obj` is already on `device`. Well that works fine except + # when PyTorch is not compiled with CUDA support, in which case even calling + # `obj.to(torch.device("cpu"))` would result in an error. + return obj if obj.device == device else obj.to(device=device) elif isinstance(obj, dict): - return any(has_tensor(value) for value in obj.values()) - elif isinstance(obj, (list, tuple)): - return any(has_tensor(item) for item in obj) - else: - return False - - -def move_to_device(obj, cuda_device: Union[torch.device, int]): - """ - Given a structure (possibly) containing Tensors on the CPU, - move all the Tensors to the specified GPU (or do nothing, if they should be on the CPU). - """ - cuda_device = int_to_device(cuda_device) - - if cuda_device == torch.device("cpu") or not has_tensor(obj): + for key, value in obj.items(): + obj[key] = move_to_device(value, device) return obj - elif isinstance(obj, torch.Tensor): - return obj.cuda(cuda_device) - elif isinstance(obj, dict): - return {key: move_to_device(value, cuda_device) for key, value in obj.items()} elif isinstance(obj, list): - return [move_to_device(item, cuda_device) for item in obj] + for i, item in enumerate(obj): + obj[i] = move_to_device(item, device) + return obj elif isinstance(obj, tuple) and hasattr(obj, "_fields"): # This is the best way to detect a NamedTuple, it turns out. - return obj.__class__(*(move_to_device(item, cuda_device) for item in obj)) + return obj.__class__(*(move_to_device(item, device) for item in obj)) elif isinstance(obj, tuple): - return tuple(move_to_device(item, cuda_device) for item in obj) + return tuple(move_to_device(item, device) for item in obj) else: return obj @@ -1231,7 +1223,7 @@ def batched_index_select( An example use case of this function is looking up the start and end indices of spans in a sequence tensor. 
This is used in the - [CoreferenceResolver](https://docs.allennlp.org/models/master/models/coref/models/coref/) + [CoreferenceResolver](https://docs.allennlp.org/models/main/models/coref/models/coref/) model to select contextual word representations corresponding to the start and end indices of mentions. diff --git a/allennlp/predictors/__init__.py b/allennlp/predictors/__init__.py index 99ca930ae0e..aa42fb4614b 100644 --- a/allennlp/predictors/__init__.py +++ b/allennlp/predictors/__init__.py @@ -9,3 +9,10 @@ from allennlp.predictors.predictor import Predictor from allennlp.predictors.sentence_tagger import SentenceTaggerPredictor from allennlp.predictors.text_classifier import TextClassifierPredictor + +try: + from allennlp.predictors.vilbert_vqa import VilbertVqaPredictor + from allennlp.predictors.visual_entailment import VisualEntailmentPredictor +except ImportError: + # vision-based predictors are not available if we don't have detectron. + pass diff --git a/allennlp/predictors/predictor.py b/allennlp/predictors/predictor.py index 3ea94182edb..92023e785da 100644 --- a/allennlp/predictors/predictor.py +++ b/allennlp/predictors/predictor.py @@ -67,6 +67,7 @@ def json_to_labeled_instances(self, inputs: JsonDict) -> List[Instance]: """ instance = self._json_to_instance(inputs) + self._dataset_reader.apply_token_indexers(instance) outputs = self._model.forward_on_instance(instance) new_instances = self.predictions_to_labeled_instances(instance, outputs) return new_instances @@ -104,6 +105,9 @@ def get_gradients(self, instances: List[Instance]) -> Tuple[Dict[str, Any], Dict embedding_gradients: List[Tensor] = [] hooks: List[RemovableHandle] = self._register_embedding_gradient_hooks(embedding_gradients) + for instance in instances: + self._dataset_reader.apply_token_indexers(instance) + dataset = Batch(instances) dataset.index_instances(self._model.vocab) dataset_tensor_dict = util.move_to_device(dataset.as_tensor_dict(), self.cuda_device) @@ -255,6 +259,7 @@ def _add_output(mod, _, outputs): hook.remove() def predict_instance(self, instance: Instance) -> JsonDict: + self._dataset_reader.apply_token_indexers(instance) outputs = self._model.forward_on_instance(instance) return sanitize(outputs) @@ -286,6 +291,8 @@ def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]: return self.predict_batch_instance(instances) def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]: + for instance in instances: + self._dataset_reader.apply_token_indexers(instance) outputs = self._model.forward_on_instances(instances) return sanitize(outputs) diff --git a/allennlp/predictors/sentence_tagger.py b/allennlp/predictors/sentence_tagger.py index bd33316fe1e..04d19e92916 100644 --- a/allennlp/predictors/sentence_tagger.py +++ b/allennlp/predictors/sentence_tagger.py @@ -16,7 +16,7 @@ class SentenceTaggerPredictor(Predictor): """ Predictor for any model that takes in a sentence and returns a single set of tags for it. In particular, it can be used with - the [`CrfTagger`](https://docs.allennlp.org/models/master/models/tagging/models/crf_tagger/) + the [`CrfTagger`](https://docs.allennlp.org/models/main/models/tagging/models/crf_tagger/) model and also the [`SimpleTagger`](../models/simple_tagger.md) model. Registered as a `Predictor` with name "sentence_tagger". 
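As a quick illustration of the rewritten `nn.util.move_to_device` above, here is a minimal sketch (not part of the patch; the example batch and the target device are made up):

import torch
from allennlp.nn.util import move_to_device

# A nested structure of tensors, similar to what `Batch.as_tensor_dict()` produces.
batch = {
    "tokens": {"token_ids": torch.tensor([[1, 2, 3]])},
    "labels": [torch.tensor([0]), torch.tensor([1])],
}

# Dicts and lists are updated in place and returned. Tensors that are already on
# the target device are returned untouched, which also keeps CPU-only builds working.
batch = move_to_device(batch, torch.device("cpu"))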
diff --git a/allennlp/training/__init__.py b/allennlp/training/__init__.py index 1a393b71d8c..d14a0845f2b 100644 --- a/allennlp/training/__init__.py +++ b/allennlp/training/__init__.py @@ -4,8 +4,7 @@ from allennlp.training.trainer import ( Trainer, GradientDescentTrainer, - BatchCallback, - EpochCallback, TrainerCallback, TrackEpochCallback, + TensorBoardBatchMemoryUsage, ) diff --git a/allennlp/training/metric_tracker.py b/allennlp/training/metric_tracker.py index 0daa3f070f0..c2ee482c8e7 100644 --- a/allennlp/training/metric_tracker.py +++ b/allennlp/training/metric_tracker.py @@ -1,4 +1,4 @@ -from typing import Optional, Iterable, Dict, Any +from typing import Optional, Dict, Any, List, Union from allennlp.common.checks import ConfigurationError @@ -10,66 +10,56 @@ class MetricTracker: `state_dict` / `load_state_dict` interface, so that it can be checkpointed along with your model and optimizer. - Some metrics improve by increasing; others by decreasing. Here you can either explicitly - supply `should_decrease`, or you can provide a `metric_name` in which case "should decrease" - is inferred from the first character, which must be "+" or "-". + Some metrics improve by increasing; others by decreasing. You can provide a + `metric_name` that starts with "+" to indicate an increasing metric, or "-" + to indicate a decreasing metric. # Parameters + metric_name : `Union[str, List[str]]` + Specifies the metric or metrics to track. Metric names have to start with + "+" for increasing metrics or "-" for decreasing ones. If you specify more + than one, it tracks the sum of the increasing metrics metrics minus the sum + of the decreasing metrics. patience : `int`, optional (default = `None`) If provided, then `should_stop_early()` returns True if we go this many epochs without seeing a new best value. - metric_name : `str`, optional (default = `None`) - If provided, it's used to infer whether we expect the metric values to - increase (if it starts with "+") or decrease (if it starts with "-"). - It's an error if it doesn't start with one of those. If it's not provided, - you should specify `should_decrease` instead. - should_decrease : `str`, optional (default = `None`) - If `metric_name` isn't provided (in which case we can't infer `should_decrease`), - then you have to specify it here. """ def __init__( - self, patience: Optional[int] = None, metric_name: str = None, should_decrease: bool = None + self, + metric_name: Union[str, List[str]], + patience: Optional[int] = None, ) -> None: - self._best_so_far: Optional[float] = None self._patience = patience + self._best_so_far: Optional[float] = None self._epochs_with_no_improvement = 0 self._is_best_so_far = True - self.best_epoch_metrics: Dict[str, float] = {} self._epoch_number = 0 self.best_epoch: Optional[int] = None + self.best_epoch_metrics: Dict[str, float] = {} - # If the metric name starts with "+", we want it to increase. - # If the metric name starts with "-", we want it to decrease. - # We also allow you to not specify a metric name and just set `should_decrease` directly. 
- if should_decrease is not None and metric_name is not None: - raise ConfigurationError( - "must specify either `should_decrease` or `metric_name` (but not both)" - ) - elif metric_name is not None: - if metric_name[0] == "-": - self._should_decrease = True - elif metric_name[0] == "+": - self._should_decrease = False + if isinstance(metric_name, str): + metric_name = [metric_name] + self.tracked_metrics = [] + for name in metric_name: + if name.startswith("+"): + self.tracked_metrics.append((1.0, name[1:])) + elif name.startswith("-"): + self.tracked_metrics.append((-1.0, name[1:])) else: raise ConfigurationError("metric_name must start with + or -") - elif should_decrease is not None: - self._should_decrease = should_decrease - else: - raise ConfigurationError( - "must specify either `should_decrease` or `metric_name` (but not both)" - ) def clear(self) -> None: """ - Clears out the tracked metrics, but keeps the patience and should_decrease settings. + Clears out the tracked metrics, but keeps the patience """ self._best_so_far = None self._epochs_with_no_improvement = 0 self._is_best_so_far = True self._epoch_number = 0 self.best_epoch = None + self.best_epoch_metrics.clear() def state_dict(self) -> Dict[str, Any]: """ @@ -77,13 +67,11 @@ def state_dict(self) -> Dict[str, Any]: """ return { "best_so_far": self._best_so_far, - "patience": self._patience, "epochs_with_no_improvement": self._epochs_with_no_improvement, "is_best_so_far": self._is_best_so_far, - "should_decrease": self._should_decrease, - "best_epoch_metrics": self.best_epoch_metrics, "epoch_number": self._epoch_number, "best_epoch": self.best_epoch, + "best_epoch_metrics": self.best_epoch_metrics, } def load_state_dict(self, state_dict: Dict[str, Any]) -> None: @@ -91,41 +79,41 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None: A `Trainer` can use this to hydrate a metric tracker from a serialized state. """ self._best_so_far = state_dict["best_so_far"] - self._patience = state_dict["patience"] self._epochs_with_no_improvement = state_dict["epochs_with_no_improvement"] self._is_best_so_far = state_dict["is_best_so_far"] - self._should_decrease = state_dict["should_decrease"] - self.best_epoch_metrics = state_dict["best_epoch_metrics"] self._epoch_number = state_dict["epoch_number"] self.best_epoch = state_dict["best_epoch"] - def add_metric(self, metric: float) -> None: + # Even though we don't promise backwards compatibility for the --recover flag, + # it's particularly easy and harmless to provide it here, so we do it. + self.best_epoch_metrics = state_dict.get("best_epoch_metrics", {}) + + def add_metrics(self, metrics: Dict[str, float]) -> None: """ Record a new value of the metric and update the various things that depend on it. """ - new_best = ( - (self._best_so_far is None) - or (self._should_decrease and metric < self._best_so_far) - or (not self._should_decrease and metric > self._best_so_far) - ) + try: + combined_score = sum( + factor * metrics[metric_name] for factor, metric_name in self.tracked_metrics + ) + except KeyError as e: + raise ConfigurationError( + f"You configured the trainer to use the {e.args[0]}" + "metric for early stopping, but the model did not produce that metric." 
+ ) + + new_best = (self._best_so_far is None) or (combined_score > self._best_so_far) if new_best: - self.best_epoch = self._epoch_number - self._is_best_so_far = True - self._best_so_far = metric + self._best_so_far = combined_score self._epochs_with_no_improvement = 0 + self._is_best_so_far = True + self.best_epoch = self._epoch_number else: - self._is_best_so_far = False self._epochs_with_no_improvement += 1 + self._is_best_so_far = False self._epoch_number += 1 - def add_metrics(self, metrics: Iterable[float]) -> None: - """ - Helper to add multiple metrics at once. - """ - for metric in metrics: - self.add_metric(metric) - def is_best_so_far(self) -> bool: """ Returns true if the most recent value of the metric is the best so far. diff --git a/allennlp/training/metrics/__init__.py b/allennlp/training/metrics/__init__.py index 7dc6a4ee959..daf16cec5bd 100644 --- a/allennlp/training/metrics/__init__.py +++ b/allennlp/training/metrics/__init__.py @@ -17,7 +17,10 @@ DEFAULT_EVALB_DIR, ) from allennlp.training.metrics.fbeta_measure import FBetaMeasure -from allennlp.training.metrics.fbeta_multi_label_measure import FBetaMultiLabelMeasure +from allennlp.training.metrics.fbeta_multi_label_measure import ( + FBetaMultiLabelMeasure, + F1MultiLabelMeasure, +) from allennlp.training.metrics.f1_measure import F1Measure from allennlp.training.metrics.mean_absolute_error import MeanAbsoluteError from allennlp.training.metrics.metric import Metric diff --git a/allennlp/training/metrics/fbeta_measure.py b/allennlp/training/metrics/fbeta_measure.py index beb0bd95798..bd8cce644aa 100644 --- a/allennlp/training/metrics/fbeta_measure.py +++ b/allennlp/training/metrics/fbeta_measure.py @@ -4,7 +4,7 @@ import torch.distributed as dist from overrides import overrides -from allennlp.common.util import is_distributed +from allennlp.common.util import is_distributed, nan_safe_tensor_divide from allennlp.common.checks import ConfigurationError from allennlp.training.metrics.metric import Metric @@ -207,8 +207,8 @@ def get_metric(self, reset: bool = False): beta2 = self._beta ** 2 # Finally, we have all our sufficient statistics. - precision = _prf_divide(tp_sum, pred_sum) - recall = _prf_divide(tp_sum, true_sum) + precision = nan_safe_tensor_divide(tp_sum, pred_sum) + recall = nan_safe_tensor_divide(tp_sum, true_sum) fscore = (1 + beta2) * precision * recall / (beta2 * precision + recall) fscore[tp_sum == 0] = 0.0 @@ -219,9 +219,9 @@ def get_metric(self, reset: bool = False): elif self._average == "weighted": weights = true_sum weights_sum = true_sum.sum() # type: ignore - precision = _prf_divide((weights * precision).sum(), weights_sum) - recall = _prf_divide((weights * recall).sum(), weights_sum) - fscore = _prf_divide((weights * fscore).sum(), weights_sum) + precision = nan_safe_tensor_divide((weights * precision).sum(), weights_sum) + recall = nan_safe_tensor_divide((weights * recall).sum(), weights_sum) + fscore = nan_safe_tensor_divide((weights * fscore).sum(), weights_sum) if reset: self.reset() @@ -251,18 +251,3 @@ def _true_negative_sum(self): self._total_sum - self._pred_sum - self._true_sum + self._true_positive_sum ) return true_negative_sum - - -def _prf_divide(numerator, denominator): - """Performs division and handles divide-by-zero. - - On zero-division, sets the corresponding result elements to zero. 
- """ - result = numerator / denominator - mask = denominator == 0.0 - if not mask.any(): - return result - - # remove nan - result[mask] = 0.0 - return result diff --git a/allennlp/training/tensorboard_writer.py b/allennlp/training/tensorboard_writer.py index 2000f7077ea..4784372e667 100644 --- a/allennlp/training/tensorboard_writer.py +++ b/allennlp/training/tensorboard_writer.py @@ -6,7 +6,7 @@ import torch from allennlp.common.from_params import FromParams -from allennlp.data.dataloader import TensorDict +from allennlp.data import TensorDict from allennlp.nn import util as nn_util from allennlp.training.optimizers import Optimizer from allennlp.training import util as training_util diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py index 1b3a889ac62..81d18aca898 100644 --- a/allennlp/training/trainer.py +++ b/allennlp/training/trainer.py @@ -6,7 +6,7 @@ import time import traceback from contextlib import contextmanager -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from allennlp.common.util import int_to_device @@ -20,10 +20,8 @@ from allennlp.common import Lazy, Registrable, Tqdm from allennlp.common import util as common_util from allennlp.common.checks import ConfigurationError, check_for_gpu -from allennlp.data import DataLoader -from allennlp.data.dataloader import TensorDict +from allennlp.data import DataLoader, TensorDict from allennlp.models.model import Model -from allennlp.nn import util as nn_util from allennlp.training import util as training_util from allennlp.training.checkpointer import Checkpointer from allennlp.training.learning_rate_schedulers import LearningRateScheduler @@ -81,7 +79,7 @@ def __init__( self._distributed = distributed self._rank = local_rank - self._master = self._rank == 0 + self._primary = self._rank == 0 self._world_size = world_size def train(self) -> Dict[str, Any]: @@ -103,37 +101,32 @@ def get_checkpoint_state(self) -> Iterator[Tuple[Dict[str, Any], Dict[str, Any]] raise NotImplementedError -class BatchCallback(Registrable): +class TrainerCallback(Registrable): """ - An optional callback that you can pass to the `GradientDescentTrainer` that will be called at - the end of every batch, during both training and validation. The default implementation - does nothing. You can implement your own callback and do whatever you want, such as saving - predictions to disk or extra logging. - """ - - def __call__( - self, - trainer: "GradientDescentTrainer", - batch_inputs: List[List[TensorDict]], - batch_outputs: List[Dict[str, Any]], - batch_metrics: Dict[str, Any], - epoch: int, - batch_number: int, - is_training: bool, - is_master: bool, - ) -> None: - pass + A general callback object that handles multiple events. + This class has `on_batch`, `on_epoch`, and `on_end` methods, corresponding to + each callback type. Each one receives the state of the wrapper object as `self`. + This enables easier state sharing between related callbacks. -@BatchCallback.register("tensorboard-memory-usage") -class TensoboardBatchMemoryUsage(BatchCallback): + Also, this callback type is instantiated with `serialization_dir` and `on_start` is called + with the trainer instance as an argument. This might be handy in case of callback logging + and saving its own files next to the config/checkpoints/logs/etc. """ - Logs the CPU and GPU memory usage to tensorboard on every batch. 
- This is mainly used for debugging as it can cause a significant slowdown in training. - """ + def __init__(self, serialization_dir: str) -> None: + self.serialization_dir = serialization_dir + self.trainer: Optional["GradientDescentTrainer"] = None - def __call__( + def on_start( + self, trainer: "GradientDescentTrainer", is_primary: bool = True, **kwargs + ) -> None: + """ + This callback hook is called before the training is started. + """ + self.trainer = trainer + + def on_batch( self, trainer: "GradientDescentTrainer", batch_inputs: List[List[TensorDict]], @@ -142,104 +135,50 @@ def __call__( epoch: int, batch_number: int, is_training: bool, - is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: - # In the distributed case we need to call this from every worker, since every - # worker reports its own memory usage. - cpu_memory_usage = common_util.peak_cpu_memory() - gpu_memory_usage = common_util.peak_gpu_memory() - # But we only want to log from the master process. - if is_master: - trainer._tensorboard.log_memory_usage(cpu_memory_usage, gpu_memory_usage) - - -BatchCallback.register("null")(BatchCallback) - - -class EpochCallback(Registrable): - """ - An optional callback that you can pass to the `GradientDescentTrainer` that will be called at - the end of every epoch (and before the start of training, with `epoch=-1`). The default - implementation does nothing. You can implement your own callback and do whatever you want, such - as additional modifications of the trainer's state in between epochs. - """ + """ + This callback hook is called after the end of each batch. + """ + pass - def __call__( + def on_epoch( self, trainer: "GradientDescentTrainer", metrics: Dict[str, Any], epoch: int, - is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: + """ + This callback hook is called after the end of each epoch. + """ pass - -EpochCallback.register("null")(EpochCallback) - - -@EpochCallback.register("track_epoch_callback") -class TrackEpochCallback(EpochCallback): - """ - A callback that you can pass to the `GradientDescentTrainer` to access the current epoch number - in your model during training. This callback sets `model.epoch`, which can be read inside of - `model.forward()`. Since the EpochCallback passes `epoch=-1` - at the start of the training, we set `model.epoch = epoch + 1` which now denotes the number of - completed epochs at a given training state. - """ - - def __init__(self): - super().__init__() - - def __call__( + def on_end( self, trainer: "GradientDescentTrainer", - metrics: Dict[str, Any], - epoch: int, - is_master: bool, + metrics: Dict[str, Any] = None, + epoch: int = None, + is_primary: bool = True, + **kwargs, ) -> None: - trainer.model.epoch = epoch + 1 - - -_BasicCallback = Union[BatchCallback, EpochCallback] - - -class _TrainerCallbackMeta(type): - def __new__(cls, name, bases, dct): """ - Add subclasses that wrap the `TrainerCallback` into other interfaces. + This callback hook is called after the final training epoch. """ - subtype = super().__new__(cls, name, bases, dct) - # These subtypes wrap the `TrainerCallback` into the `_BasicCallback` interfaces. 
- subtype.Batch = cls._make_callback_type(BatchCallback, subtype.on_batch) - subtype.Epoch = cls._make_callback_type(EpochCallback, subtype.on_epoch) - subtype.End = cls._make_callback_type(EpochCallback, subtype.on_end) - return subtype - - @classmethod - def _make_callback_type( - cls, - call_type: Type[_BasicCallback], - call: Callable[[], None], - ) -> Type[_BasicCallback]: # type: ignore - class _Wrapper(call_type): # type: ignore - def __init__(self, trainer_callback: "TrainerCallback"): - self.trainer_callback = trainer_callback + pass - def __call__(self, trainer: "GradientDescentTrainer", *args, **kwargs): - call(self.trainer_callback, trainer, *args, **kwargs) # type: ignore - return _Wrapper +TrainerCallback.register("null")(TrainerCallback) -class TrainerCallback(Registrable, metaclass=_TrainerCallbackMeta): +@TrainerCallback.register("tensorboard-memory-usage") +class TensorBoardBatchMemoryUsage(TrainerCallback): """ - A general callback object that wraps all three types of callbacks into one. - - Rather than a `__call__` method, this class has `on_batch`, `on_epoch`, and `on_end` methods, corresponding to - each callback type. Each one receives the state of the wrapper object as `self`. This enables easier state - sharing between related callbacks. + Logs the CPU and GPU memory usage to tensorboard on every batch. - Under the hood, this is a metaclass that creates wrapping subclasses each time a subclass is created. + This is mainly used for debugging as it can cause a significant slowdown in training. """ def on_batch( @@ -251,63 +190,42 @@ def on_batch( epoch: int, batch_number: int, is_training: bool, - is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: - """ - This callback hook is called after the end of each batch. This is equivalent to `BatchCallback`. - """ - pass + # In the distributed case we need to call this from every worker, since every + # worker reports its own memory usage. + cpu_memory_usage = common_util.peak_cpu_memory() + gpu_memory_usage = common_util.peak_gpu_memory() + # But we only want to log from the primary process. + if is_primary: + trainer._tensorboard.log_memory_usage(cpu_memory_usage, gpu_memory_usage) - def on_epoch( - self, - trainer: "GradientDescentTrainer", - metrics: Dict[str, Any], - epoch: int, - is_master: bool, + +@TrainerCallback.register("track_epoch_callback") +class TrackEpochCallback(TrainerCallback): + """ + A callback that you can pass to the `GradientDescentTrainer` to access the current epoch number + in your model during training. This callback sets `model.epoch`, which can be read inside of + `model.forward()`. We set `model.epoch = epoch + 1` which now denotes the number of + completed epochs at a given training state. + """ + + def on_start( + self, trainer: "GradientDescentTrainer", is_primary: bool = True, **kwargs ) -> None: - """ - This callback hook is called after the end of each epoch. This is equivalent to `EpochCallback`. - """ - pass + super().on_start(trainer, is_primary) + trainer.model.epoch = 0 - def on_end( + def on_epoch( self, trainer: "GradientDescentTrainer", metrics: Dict[str, Any], epoch: int, - is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: - """ - This callback hook is called after the final training epoch. The `epoch` is passed as an argument. - """ - pass - - def batch(self): - """ - Construct a `BatchCallback` wrapper for this `TrainCallback`. - - The `cls.Batch` type is created by the metaclass. 
- """ - return self.Batch(self) - - def epoch(self): - """ - Construct an `EpochCallback` wrapper for this instance. - - The `cls.Epoch` type is created by the metaclass. - """ - return self.Epoch(self) - - def end(self): - """ - Construct an `EpochCallback` wrapping the `on_end` end-of-training hook. - - The `cls.End` type is created by the metaclass. - """ - return self.End(self) - - -TrainerCallback.register("null")(TrainerCallback) + trainer.model.epoch = epoch + 1 @Trainer.register("gradient_descent", constructor="from_partial_objects") @@ -315,7 +233,7 @@ class GradientDescentTrainer(Trainer): """ A trainer for doing supervised learning with gradient descent. It just takes a labeled dataset and a `DataLoader`, and uses the supplied `Optimizer` to learn the weights for your model over - some fixed number of epochs. You can also pass in a validation dataloader and enable early + some fixed number of epochs. You can also pass in a validation data_loader and enable early stopping. There are many other bells and whistles as well. Registered as a `Trainer` with the name "gradient_descent" (and is also the default `Trainer`). @@ -355,11 +273,12 @@ class GradientDescentTrainer(Trainer): after `patience` epochs with no improvement. If given, it must be `> 0`. If None, early stopping is disabled. - validation_metric : `str`, optional (default=`"-loss"`) + validation_metric : `Union[str, List[str]]`, optional (default=`"-loss"`) Validation metric to measure for whether to stop training using patience and whether to serialize an `is_best` model each epoch. The metric name must be prepended with either "+" or "-", which specifies whether the metric - is an increasing or decreasing function. + is an increasing or decreasing function. If you specify more than one metric, + the metrics will be summed to make the `is_best` decision. validation_data_loader : `DataLoader`, optional (default=`None`) A `DataLoader` to use for the validation set. If `None`, then @@ -419,20 +338,9 @@ class GradientDescentTrainer(Trainer): parameters. This is necessary because we want the saved model to perform as well as the validated model if we load it later. But this may cause problems if you restart the training from checkpoint. - batch_callbacks : `List[BatchCallback]`, optional (default = `None`) - A list of callbacks that will be called at the end of every batch, during both train and - validation. - - epoch_callbacks : `List[EpochCallback]`, optional (default = `None`) - A list of callbacks that will be called at the end of every epoch, and at the start of - training (with epoch = -1). - - end_callbacks : `List[EpochCallback]`, optional (default = `None`) - A list of callbacks that will be called after the final epoch at the end of training. The type of the - callbacks is the same as `epoch_callbacks`. - trainer_callbacks : `List[TrainerCallback]`, optional (default = `None`) - A list of callbacks that will be called at each batch, epoch, and at the start and end of training. + A list of callbacks that can be called at certain events: e.g. each batch, epoch, and at the start + and end of training, etc. distributed : `bool`, optional, (default = `False`) If set, PyTorch's `DistributedDataParallel` is used to train the model in multiple GPUs. 
This also @@ -471,7 +379,7 @@ def __init__( optimizer: torch.optim.Optimizer, data_loader: DataLoader, patience: Optional[int] = None, - validation_metric: str = "-loss", + validation_metric: Union[str, List[str]] = "-loss", validation_data_loader: DataLoader = None, num_epochs: int = 20, serialization_dir: Optional[str] = None, @@ -483,10 +391,7 @@ def __init__( momentum_scheduler: Optional[MomentumScheduler] = None, tensorboard_writer: TensorboardWriter = None, moving_average: Optional[MovingAverage] = None, - batch_callbacks: List[BatchCallback] = None, - epoch_callbacks: List[EpochCallback] = None, - end_callbacks: List[EpochCallback] = None, - trainer_callbacks: List[TrainerCallback] = None, + callbacks: List[TrainerCallback] = None, distributed: bool = False, local_rank: int = 0, world_size: int = 1, @@ -500,7 +405,10 @@ def __init__( self.model = model self.data_loader = data_loader + self.data_loader.set_target_device(self.cuda_device) self._validation_data_loader = validation_data_loader + if self._validation_data_loader is not None: + self._validation_data_loader.set_target_device(self.cuda_device) self.optimizer = optimizer if patience is None: # no early stopping @@ -516,9 +424,7 @@ def __init__( ) # For tracking is_best_so_far and should_stop_early - self._metric_tracker = MetricTracker(patience, validation_metric) - # Get rid of + or - - self._validation_metric = validation_metric[1:] + self._metric_tracker = MetricTracker(validation_metric, patience) self._num_epochs = num_epochs @@ -532,14 +438,8 @@ def __init__( self._learning_rate_scheduler = learning_rate_scheduler self._momentum_scheduler = momentum_scheduler self._moving_average = moving_average - self._batch_callbacks = batch_callbacks or [] - self._epoch_callbacks = epoch_callbacks or [] - self._end_callbacks = end_callbacks or [] - for callback in trainer_callbacks or []: - self._batch_callbacks.append(callback.batch()) - self._epoch_callbacks.append(callback.epoch()) - self._end_callbacks.append(callback.end()) + self._callbacks = callbacks or [] # We keep the total batch number as an instance variable because it # is used inside a closure for the hook which logs activations in @@ -600,7 +500,6 @@ def batch_outputs(self, batch: TensorDict, for_training: bool) -> Dict[str, torc Does a forward pass on the given batch and returns the output dictionary that the model returns, after adding any specified regularization penalty to the loss (if training). """ - batch = nn_util.move_to_device(batch, self.cuda_device) output_dict = self._pytorch_model(**batch) if for_training: @@ -662,9 +561,9 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: except TypeError: num_training_batches = float("inf") - # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's + # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's # progress is shown - if self._master: + if self._primary: batch_group_generator_tqdm = Tqdm.tqdm( batch_group_generator, total=num_training_batches ) @@ -744,7 +643,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: self._momentum_scheduler.step_batch(batch_num_total) param_updates = None - if self._tensorboard.should_log_histograms_this_batch() and self._master: + if self._tensorboard.should_log_histograms_this_batch() and self._primary: # Get the magnitude of parameter updates for logging. 
We need to do some # computation before and after the optimizer step, and it's expensive because of # GPU/CPU copies (necessary for large models, and for shipping to tensorboard), so @@ -785,8 +684,8 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: cuda_device=self.cuda_device, ) - if self._master: - # Updating tqdm only for the master as the trainers wouldn't have one + if self._primary: + # Updating tqdm only for the primary as the trainers wouldn't have one description = training_util.description_from_metrics(metrics) batch_group_generator_tqdm.set_description(description, refresh=False) self._tensorboard.log_batch( @@ -800,8 +699,9 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: if self._checkpointer is not None: self._checkpointer.maybe_save_checkpoint(self, epoch, batches_this_epoch) - for callback in self._batch_callbacks: - callback( + + for callback in self._callbacks: + callback.on_batch( self, batch_group, batch_group_outputs, @@ -809,7 +709,7 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]: epoch, batches_this_epoch, is_training=True, - is_master=self._master, + is_primary=self._primary, ) if self._distributed and not done_early: @@ -865,9 +765,9 @@ def _validation_loss(self, epoch: int) -> Tuple[float, Optional[float], int]: regularization_penalty = self.model.get_regularization_penalty() - # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the master's + # Having multiple tqdm bars in case of distributed training will be a mess. Hence only the primary's # progress is shown - if self._master: + if self._primary: val_generator_tqdm = Tqdm.tqdm(validation_data_loader) else: val_generator_tqdm = validation_data_loader @@ -928,11 +828,11 @@ def _validation_loss(self, epoch: int) -> Tuple[float, Optional[float], int]: ) description = training_util.description_from_metrics(val_metrics) - if self._master: + if self._primary: val_generator_tqdm.set_description(description, refresh=False) - for callback in self._batch_callbacks: - callback( + for callback in self._callbacks: + callback.on_batch( self, [batch], [batch_outputs], @@ -940,7 +840,7 @@ def _validation_loss(self, epoch: int) -> Tuple[float, Optional[float], int]: epoch, batches_this_epoch, is_training=False, - is_master=self._master, + is_primary=self._primary, ) if self._distributed and not done_early: @@ -962,13 +862,24 @@ def train(self) -> Dict[str, Any]: """ Trains the supplied model with the supplied parameters. 
""" + + for callback in self._callbacks: + callback.on_start(self, is_primary=self._primary) + + # Set default values in case of failure + epoch = None + metrics = None + try: - return self._try_train() + metrics, epoch = self._try_train() + return metrics finally: # make sure pending events are flushed to disk and files are closed properly + for callback in self._callbacks: + callback.on_end(self, metrics=metrics, epoch=epoch, is_primary=self._primary) self._tensorboard.close() - def _try_train(self) -> Dict[str, Any]: + def _try_train(self) -> Tuple[Dict[str, Any], int]: try: epoch_counter = self._restore_checkpoint() except RuntimeError: @@ -993,17 +904,14 @@ def _try_train(self) -> Dict[str, Any]: for key, value in self._metric_tracker.best_epoch_metrics.items(): metrics["best_validation_" + key] = value - for callback in self._epoch_callbacks: - callback(self, metrics={}, epoch=-1, is_master=self._master) - for epoch in range(epoch_counter, self._num_epochs): epoch_start_time = time.time() train_metrics = self._train_epoch(epoch) - if self._master and self._checkpointer is not None: + if self._primary and self._checkpointer is not None: self._checkpointer.save_checkpoint(epoch, self, save_model_only=True) - # Wait for the master to finish saving the model checkpoint + # Wait for the primary process to finish saving the model checkpoint if self._distributed: dist.barrier() @@ -1037,14 +945,13 @@ def _try_train(self) -> Dict[str, Any]: ) # Check validation metric for early stopping - this_epoch_val_metric = val_metrics[self._validation_metric] - self._metric_tracker.add_metric(this_epoch_val_metric) + self._metric_tracker.add_metrics(val_metrics) if self._metric_tracker.should_stop_early(): logger.info("Ran out of patience. Stopping training.") break - if self._master: + if self._primary: self._tensorboard.log_metrics( train_metrics, val_metrics=val_metrics, log_to_console=True, epoch=epoch + 1 ) # +1 because tensorboard doesn't like 0 @@ -1070,7 +977,7 @@ def _try_train(self) -> Dict[str, Any]: self._metric_tracker.best_epoch_metrics = val_metrics - if self._serialization_dir and self._master: + if self._serialization_dir and self._primary: common_util.dump_metrics( os.path.join(self._serialization_dir, f"metrics_epoch_{epoch}.json"), metrics, @@ -1083,17 +990,17 @@ def _try_train(self) -> Dict[str, Any]: if self._momentum_scheduler: self._momentum_scheduler.step(this_epoch_val_metric) - if self._master and self._checkpointer is not None: + if self._primary and self._checkpointer is not None: self._checkpointer.save_checkpoint( epoch, self, is_best_so_far=self._metric_tracker.is_best_so_far() ) - # Wait for the master to finish saving the checkpoint + # Wait for the primary process to finish saving the checkpoint if self._distributed: dist.barrier() - for callback in self._epoch_callbacks: - callback(self, metrics=metrics, epoch=epoch, is_master=self._master) + for callback in self._callbacks: + callback.on_epoch(self, metrics=metrics, epoch=epoch, is_primary=self._primary) epoch_elapsed_time = time.time() - epoch_start_time logger.info("Epoch duration: %s", datetime.timedelta(seconds=epoch_elapsed_time)) @@ -1107,9 +1014,8 @@ def _try_train(self) -> Dict[str, Any]: logger.info("Estimated training time remaining: %s", formatted_time) epochs_trained += 1 - - for callback in self._end_callbacks: - callback(self, metrics=metrics, epoch=epoch, is_master=self._master) + else: + epoch = self._num_epochs - 1 # Load the best model state before returning best_model_state = ( @@ -1118,7 +1024,7 
@@ def _try_train(self) -> Dict[str, Any]: if best_model_state: self.model.load_state_dict(best_model_state) - return metrics + return metrics, epoch @contextmanager def get_checkpoint_state(self) -> Iterator[Tuple[Dict[str, Any], Dict[str, Any]]]: @@ -1189,11 +1095,6 @@ def _restore_checkpoint(self) -> int: # Currently the `training_state` contains a serialized `MetricTracker`. if "metric_tracker" in training_state: self._metric_tracker.load_state_dict(training_state["metric_tracker"]) - # It used to be the case that we tracked `val_metric_per_epoch`. - elif "val_metric_per_epoch" in training_state: - self._metric_tracker.clear() - self._metric_tracker.add_metrics(training_state["val_metric_per_epoch"]) - # And before that we didn't track anything. else: self._metric_tracker.clear() @@ -1219,7 +1120,7 @@ def from_partial_objects( validation_data_loader: DataLoader = None, local_rank: int = 0, patience: int = None, - validation_metric: str = "-loss", + validation_metric: Union[str, List[str]] = "-loss", num_epochs: int = 20, cuda_device: Optional[Union[int, torch.device]] = None, grad_norm: float = None, @@ -1235,10 +1136,8 @@ def from_partial_objects( tensorboard_writer: Lazy[TensorboardWriter] = Lazy(TensorboardWriter), moving_average: Lazy[MovingAverage] = None, checkpointer: Lazy[Checkpointer] = Lazy(Checkpointer), - batch_callbacks: List[BatchCallback] = None, - epoch_callbacks: List[EpochCallback] = None, - end_callbacks: List[EpochCallback] = None, - trainer_callbacks: List[TrainerCallback] = None, + callbacks: List[Lazy[TrainerCallback]] = None, + trainer_callbacks: List[Lazy[TrainerCallback]] = None, ) -> "Trainer": """ This method exists so that we can have a documented method to construct this class using @@ -1304,6 +1203,14 @@ def from_partial_objects( checkpointer_ = checkpointer.construct(serialization_dir=serialization_dir) tensorboard_writer_ = tensorboard_writer.construct(serialization_dir=serialization_dir) + callbacks = callbacks or trainer_callbacks or [] + + callbacks_: List[TrainerCallback] = [] + + for callback in callbacks: + callback_ = callback.construct(serialization_dir=serialization_dir) + callbacks_.append(callback_) + return cls( model, optimizer_, @@ -1321,10 +1228,7 @@ def from_partial_objects( tensorboard_writer=tensorboard_writer_, checkpointer=checkpointer_, moving_average=moving_average_, - batch_callbacks=batch_callbacks, - epoch_callbacks=epoch_callbacks, - end_callbacks=end_callbacks, - trainer_callbacks=trainer_callbacks, + callbacks=callbacks_, distributed=distributed, local_rank=local_rank, world_size=world_size, diff --git a/allennlp/training/util.py b/allennlp/training/util.py index 7a79cb7a5e7..09feb3481f3 100644 --- a/allennlp/training/util.py +++ b/allennlp/training/util.py @@ -7,28 +7,22 @@ import shutil import json from os import PathLike -from typing import Any, Dict, Iterable, Optional, Union, Tuple, Set, List, TYPE_CHECKING +from typing import Any, Dict, Iterable, Optional, Union, Tuple, Set, List from collections import Counter import torch - -# import torch.distributed as dist -from torch.utils.data import DataLoader from torch.nn.utils import clip_grad_norm_ from allennlp.common.checks import check_for_gpu, ConfigurationError from allennlp.common.params import Params from allennlp.common.tqdm import Tqdm -from allennlp.common.util import dump_metrics, sanitize -from allennlp.data import Instance, Vocabulary, Batch +from allennlp.common.util import dump_metrics, sanitize, int_to_device +from allennlp.data import Instance, Vocabulary, 
Batch, DataLoader from allennlp.data.dataset_readers import DatasetReader from allennlp.models.archival import CONFIG_NAME from allennlp.models.model import Model from allennlp.nn import util as nn_util -if TYPE_CHECKING: - from allennlp.data import AllennlpDataset - from allennlp.data import AllennlpLazyDataset logger = logging.getLogger(__name__) @@ -90,73 +84,42 @@ def str_to_time(time_str: str) -> datetime.datetime: return datetime.datetime(*pieces) -def read_all_datasets( - train_data_path: str, - dataset_reader: DatasetReader, - validation_dataset_reader: DatasetReader = None, - validation_data_path: str = None, - test_data_path: str = None, -) -> Dict[str, Union["AllennlpDataset", "AllennlpLazyDataset"]]: - """ - Reads all datasets (perhaps lazily, if the corresponding dataset readers are lazy) and returns a - dictionary mapping dataset name ("train", "validation" or "test") to the iterable resulting from - `reader.read(filename)`. - """ - - logger.info("Reading training data from %s", train_data_path) - train_data = dataset_reader.read(train_data_path) - - datasets = {"train": train_data} - - validation_dataset_reader = validation_dataset_reader or dataset_reader - - if validation_data_path is not None: - logger.info("Reading validation data from %s", validation_data_path) - validation_data = validation_dataset_reader.read(validation_data_path) - datasets["validation"] = validation_data - - if test_data_path is not None: - logger.info("Reading test data from %s", test_data_path) - test_data = validation_dataset_reader.read(test_data_path) - datasets["test"] = test_data - - return datasets - - -def datasets_from_params( +def data_loaders_from_params( params: Params, train: bool = True, validation: bool = True, test: bool = True, serialization_dir: Optional[Union[str, PathLike]] = None, -) -> Dict[str, Union["AllennlpDataset", "AllennlpLazyDataset"]]: +) -> Dict[str, DataLoader]: """ - Load datasets specified by the config. + Instantiate data loaders specified by the config. """ - datasets: Dict[str, Union["AllennlpDataset", "AllennlpLazyDataset"]] = {} + data_loaders: Dict[str, DataLoader] = {} train = train and ("train_data_path" in params) validation = validation and ("validation_data_path" in params) test = test and ("test_data_path" in params) if not any((train, validation, test)): # Return early so don't unnecessarily initialize the train data reader. - return datasets + return data_loaders dataset_reader_params = params.pop("dataset_reader") dataset_reader = DatasetReader.from_params( dataset_reader_params, serialization_dir=serialization_dir ) + data_loader_params = params.pop("data_loader") if train: train_data_path = params.pop("train_data_path") logger.info("Reading training data from %s", train_data_path) - train_data = dataset_reader.read(train_data_path) - datasets["train"] = train_data + data_loaders["train"] = DataLoader.from_params( + data_loader_params.duplicate(), reader=dataset_reader, data_path=train_data_path + ) if not validation and not test: # Return early so we don't unnecessarily initialize the validation/test data # reader. 
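As a concrete illustration of what `data_loaders_from_params` consumes (not part of the patch; the reader type, file paths, and batch sizes below are placeholder assumptions):

from allennlp.common import Params
from allennlp.training.util import data_loaders_from_params

params = Params(
    {
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": "path/to/train.tsv",
        "validation_data_path": "path/to/dev.tsv",
        "data_loader": {"batch_size": 32},
        # Optional override applied to the validation (and test) loaders:
        "validation_data_loader": {"batch_size": 64},
    }
)
loaders = data_loaders_from_params(params)  # e.g. {"train": ..., "validation": ...}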
- return datasets + return data_loaders validation_and_test_dataset_reader: DatasetReader = dataset_reader validation_dataset_reader_params = params.pop("validation_dataset_reader", None) @@ -166,19 +129,27 @@ def datasets_from_params( validation_dataset_reader_params, serialization_dir=serialization_dir ) + validation_data_loader_params = params.pop("validation_data_loader", data_loader_params) + if validation: validation_data_path = params.pop("validation_data_path") logger.info("Reading validation data from %s", validation_data_path) - validation_data = validation_and_test_dataset_reader.read(validation_data_path) - datasets["validation"] = validation_data + data_loaders["validation"] = DataLoader.from_params( + validation_data_loader_params.duplicate(), + reader=validation_and_test_dataset_reader, + data_path=validation_data_path, + ) if test: test_data_path = params.pop("test_data_path") logger.info("Reading test data from %s", test_data_path) - test_data = validation_and_test_dataset_reader.read(test_data_path) - datasets["test"] = test_data + data_loaders["test"] = DataLoader.from_params( + validation_data_loader_params, + reader=validation_and_test_dataset_reader, + data_path=test_data_path, + ) - return datasets + return data_loaders def create_serialization_dir( @@ -344,6 +315,7 @@ def evaluate( The final metrics. """ check_for_gpu(cuda_device) + data_loader.set_target_device(int_to_device(cuda_device)) predictions_file = ( None if predictions_output_file is None else open(predictions_output_file, "w") ) @@ -462,21 +434,21 @@ def make_vocab_from_params( "datasets_for_vocab_creation", None ) # Do a quick sanity check here. There's no need to load any datasets if the vocab - # type is "empty". - if datasets_for_vocab_creation is None and vocab_params.get("type") in ("empty", "from_files"): + # type is "empty" or "from_files". + if datasets_for_vocab_creation is None and vocab_params.get("type") in {"empty", "from_files"}: datasets_for_vocab_creation = [] - datasets: Dict[str, Union["AllennlpDataset", "AllennlpLazyDataset"]] + data_loaders: Dict[str, DataLoader] if datasets_for_vocab_creation is None: # If `datasets_for_vocab_creation` was not specified, we'll use all datasets # from the config. - datasets = datasets_from_params(params, serialization_dir=serialization_dir) + data_loaders = data_loaders_from_params(params, serialization_dir=serialization_dir) else: for dataset_name in datasets_for_vocab_creation: data_path = f"{dataset_name}_data_path" if data_path not in params: raise ConfigurationError(f"invalid 'datasets_for_vocab_creation' {dataset_name}") - datasets = datasets_from_params( + data_loaders = data_loaders_from_params( params, serialization_dir=serialization_dir, train=("train" in datasets_for_vocab_creation), @@ -486,9 +458,9 @@ def make_vocab_from_params( instances: Iterable[Instance] = ( instance - for key, dataset in datasets.items() + for key, data_loader in data_loaders.items() if datasets_for_vocab_creation is None or key in datasets_for_vocab_creation - for instance in dataset + for instance in data_loader.iter_instances() ) if print_statistics: diff --git a/allennlp/version.py b/allennlp/version.py index d478d167183..5dfd069590c 100644 --- a/allennlp/version.py +++ b/allennlp/version.py @@ -1,13 +1,13 @@ import os -_MAJOR = "1" -_MINOR = "3" +_MAJOR = "2" +_MINOR = "0" # On main and in a nightly release the patch should be one ahead of the last # released build. _PATCH = "0" # This is mainly for nightly builds which have the suffix ".dev$DATE". 
See # https://semver.org/#is-v123-a-semantic-version for the semantics. -_SUFFIX = os.environ.get("ALLENNLP_VERSION_SUFFIX", "") +_SUFFIX = os.environ.get("ALLENNLP_VERSION_SUFFIX", "rc1") VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX) diff --git a/mkdocs-skeleton.yml b/mkdocs-skeleton.yml index 57a8d616357..e69e9fe8ea1 100644 --- a/mkdocs-skeleton.yml +++ b/mkdocs-skeleton.yml @@ -36,8 +36,12 @@ nav: markdown_extensions: - toc: - permalink: '#' + permalink: true + toc_depth: 3 +- markdown.extensions.codehilite: + guess_lang: true - admonition +- codehilite - extra - pymdownx.highlight - pymdownx.superfences diff --git a/scripts/py2md.py b/scripts/py2md.py index 517b48ab6f4..798febaa6df 100755 --- a/scripts/py2md.py +++ b/scripts/py2md.py @@ -213,18 +213,20 @@ def process_node(self, node): state.current_section = None else: state.consecutive_blank_line_count = 0 - line = self._preprocess_line(line, state) + line = self._preprocess_line(node, line, state) lines.append(line) # Now set the docstring to our preprocessed version of it. node.docstring = "\n".join(lines) - def _preprocess_line(self, line, state: ProcessorState) -> str: + def _preprocess_line(self, node, line, state: ProcessorState) -> str: match = re.match(r"#+ (.*)$", line) if match: state.current_section = Section.from_str(match.group(1).strip()) - line = re.sub(r"#+ (.*)$", r"\1\n", line) + name = match.group(1).strip() + slug = (node.name + "." + match.group(1).strip()).lower().replace(" ", "_") + line = f'

{name}

\n' # noqa: E501 else: if line and not line.startswith(" ") and not line.startswith("!!! "): if state.current_section in ( @@ -272,7 +274,7 @@ class AllenNlpFilterProcessor(Struct): Used to filter out nodes that we don't want to document. """ - PRIVATE_METHODS_TO_KEEP = {"DatasetReader._read", "__call__"} + PRIVATE_METHODS_TO_KEEP = {"DatasetReader._read", "__call__", "__iter__"} def process(self, graph, _resolver): graph.visit(self._process_node) diff --git a/scripts/tests/py2md/basic_example_expected_output.md b/scripts/tests/py2md/basic_example_expected_output.md index 647376a70b9..fc04311dece 100644 --- a/scripts/tests/py2md/basic_example_expected_output.md +++ b/scripts/tests/py2md/basic_example_expected_output.md @@ -37,7 +37,7 @@ def func_with_args(a: int, b: int, c: int = 3) -> int This function has some args. -Parameters +

Parameters

- __a__ : `int`
@@ -47,11 +47,11 @@ This function has some args. - __c__ : `int`, optional (default = `3`)
Yet another number. -Notes +

Notes

These are some notes. -Returns +

Returns

- `int`
@@ -67,7 +67,7 @@ class SomeClass: I'm a class! -Parameters +

Parameters

- __x__ : `float`
@@ -106,7 +106,7 @@ I'm a method! But I don't do anything. -Returns +

Returns

- `None`
@@ -122,7 +122,7 @@ class SomeClass: Another method. -Returns +

Returns

- A completely arbitrary number.
@@ -138,7 +138,7 @@ class SomeClass: Another method. -Returns +

Returns

- __number__ : `int`
diff --git a/setup.py b/setup.py index 18cb1548f14..39da8baf280 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ ), install_requires=[ "torch>=1.6.0,<1.8.0", + "torchvision>=0.8.1,<0.9.0", "jsonnet>=0.10.0 ; sys.platform != 'win32'", "overrides==3.1.0", "nltk", @@ -69,6 +70,8 @@ "jsonpickle", "dataclasses;python_version<'3.7'", "filelock>=3.0,<3.1", + "lmdb", + "more-itertools", ], entry_points={"console_scripts": ["allennlp=allennlp.__main__:run"]}, include_package_data=True, diff --git a/test_fixtures/basic_classifier/common.jsonnet b/test_fixtures/basic_classifier/common.jsonnet index 6aee2deb469..86572153a9d 100644 --- a/test_fixtures/basic_classifier/common.jsonnet +++ b/test_fixtures/basic_classifier/common.jsonnet @@ -1,6 +1,5 @@ { "dataset_reader": { - "lazy": false, "type": "text_classification_json", "tokenizer": { "type": "spacy" diff --git a/test_fixtures/basic_classifier/embedding_with_trainable_is_false/model.tar.gz b/test_fixtures/basic_classifier/embedding_with_trainable_is_false/model.tar.gz index aeed95f8c16..8c0c69c3f4b 100644 Binary files a/test_fixtures/basic_classifier/embedding_with_trainable_is_false/model.tar.gz and b/test_fixtures/basic_classifier/embedding_with_trainable_is_false/model.tar.gz differ diff --git a/test_fixtures/basic_classifier/from_archive_serialization/model.tar.gz b/test_fixtures/basic_classifier/from_archive_serialization/model.tar.gz index d5d36049531..f5e50cf703a 100644 Binary files a/test_fixtures/basic_classifier/from_archive_serialization/model.tar.gz and b/test_fixtures/basic_classifier/from_archive_serialization/model.tar.gz differ diff --git a/test_fixtures/basic_classifier/serialization/model.tar.gz b/test_fixtures/basic_classifier/serialization/model.tar.gz index dfa653ee82e..2cd1b3c2b18 100644 Binary files a/test_fixtures/basic_classifier/serialization/model.tar.gz and b/test_fixtures/basic_classifier/serialization/model.tar.gz differ diff --git a/test_fixtures/data/images/COCO_train2014_000000458752.jpg b/test_fixtures/data/images/COCO_train2014_000000458752.jpg new file mode 100755 index 00000000000..0a4cfa6758f Binary files /dev/null and b/test_fixtures/data/images/COCO_train2014_000000458752.jpg differ diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index 338560f5154..8ad4e624df5 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -8,7 +8,7 @@ from allennlp.commands.evaluate import evaluate_from_args, Evaluate, evaluate from allennlp.common.testing import AllenNlpTestCase -from allennlp.data.dataloader import TensorDict +from allennlp.data.data_loaders import TensorDict from allennlp.models import Model @@ -23,6 +23,9 @@ def __iter__(self) -> Iterator[TensorDict]: def __len__(self): return len(self._outputs) + def set_target_device(self, _): + pass + class DummyModel(Model): def __init__(self) -> None: diff --git a/tests/commands/find_learning_rate_test.py b/tests/commands/find_learning_rate_test.py index 41ff34fdf96..f33cdb1f924 100644 --- a/tests/commands/find_learning_rate_test.py +++ b/tests/commands/find_learning_rate_test.py @@ -5,7 +5,6 @@ from allennlp.common import Params from allennlp.data import Vocabulary -from allennlp.data import DataLoader from allennlp.models import Model from allennlp.common.checks import ConfigurationError from allennlp.common.testing import AllenNlpTestCase, requires_multi_gpu @@ -16,7 +15,7 @@ FindLearningRate, ) from allennlp.training import Trainer -from allennlp.training.util import datasets_from_params +from 
allennlp.training.util import data_loaders_from_params def is_matplotlib_installed(): @@ -170,16 +169,20 @@ def setup_method(self): "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"}, } ) - all_datasets = datasets_from_params(params) + all_data_loaders = data_loaders_from_params(params) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), - instances=(instance for dataset in all_datasets.values() for instance in dataset), + instances=( + instance + for data_loader in all_data_loaders.values() + for instance in data_loader.iter_instances() + ), ) model = Model.from_params(vocab=vocab, params=params.pop("model")) - train_data = all_datasets["train"] - train_data.index_with(vocab) - data_loader = DataLoader.from_params(dataset=train_data, params=params.pop("data_loader")) + data_loader = all_data_loaders["train"] + data_loader.index_with(vocab) + trainer_params = params.pop("trainer") serialization_dir = os.path.join(self.TEST_DIR, "test_search_learning_rate") @@ -187,7 +190,6 @@ def setup_method(self): model=model, serialization_dir=serialization_dir, data_loader=data_loader, - train_data=train_data, params=trainer_params, validation_data=None, validation_iterator=None, diff --git a/tests/commands/train_test.py b/tests/commands/train_test.py index 75f4515390a..3d2910740f5 100644 --- a/tests/commands/train_test.py +++ b/tests/commands/train_test.py @@ -8,7 +8,7 @@ import re import shutil from collections import OrderedDict, Counter -from typing import Iterable, Optional, List, Dict, Any +from typing import Optional, List, Dict, Any import pytest import torch @@ -17,11 +17,11 @@ from allennlp.common import Params from allennlp.common.checks import ConfigurationError from allennlp.common.testing import AllenNlpTestCase, cpu_or_gpu -from allennlp.data import DatasetReader, Instance, Vocabulary -from allennlp.data.dataloader import TensorDict +from allennlp.data import Vocabulary +from allennlp.data.data_loaders import TensorDict from allennlp.models import load_archive, Model from allennlp.models.archival import CONFIG_NAME -from allennlp.training import BatchCallback, GradientDescentTrainer +from allennlp.training import TrainerCallback, GradientDescentTrainer from allennlp.training.learning_rate_schedulers import ( ExponentialLearningRateScheduler, LearningRateScheduler, @@ -31,9 +31,9 @@ SEQUENCE_TAGGING_SHARDS_PATH = str(AllenNlpTestCase.FIXTURES_ROOT / "data" / "shards" / "*") -@BatchCallback.register("training_data_logger") -class TrainingDataLoggerBatchCallback(BatchCallback): - def __call__( # type: ignore +@TrainerCallback.register("training_data_logger") +class TrainingDataLoggerOnBatchCallback(TrainerCallback): + def on_batch( # type: ignore self, trainer: "GradientDescentTrainer", batch_inputs: List[TensorDict], @@ -42,7 +42,8 @@ def __call__( # type: ignore epoch: int, batch_number: int, is_training: bool, - is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: if is_training: logger = logging.getLogger(__name__) @@ -54,9 +55,9 @@ def __call__( # type: ignore _seen_training_devices = set() -@BatchCallback.register("training_device_logger") -class TrainingDeviceLoggerBatchCallback(BatchCallback): - def __call__( # type: ignore +@TrainerCallback.register("training_device_logger") +class TrainingDeviceLoggerOnBatchCallback(TrainerCallback): + def on_batch( # type: ignore self, trainer: "GradientDescentTrainer", batch_inputs: List[TensorDict], @@ -65,7 +66,8 @@ def __call__( # type: ignore epoch: int, batch_number: int, is_training: bool, - 
is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: global _seen_training_devices for tensor in trainer.model.parameters(): @@ -141,7 +143,7 @@ def test_detect_gpu(self): import copy params = copy.deepcopy(self.DEFAULT_PARAMS) - params["trainer"]["batch_callbacks"] = ["training_device_logger"] + params["trainer"]["callbacks"] = ["training_device_logger"] global _seen_training_devices _seen_training_devices.clear() @@ -158,7 +160,7 @@ def test_force_gpu(self): import copy params = copy.deepcopy(self.DEFAULT_PARAMS) - params["trainer"]["batch_callbacks"] = ["training_device_logger"] + params["trainer"]["callbacks"] = ["training_device_logger"] params["trainer"]["cuda_device"] = 0 global _seen_training_devices @@ -177,7 +179,7 @@ def test_force_cpu(self): import copy params = copy.deepcopy(self.DEFAULT_PARAMS) - params["trainer"]["batch_callbacks"] = ["training_device_logger"] + params["trainer"]["callbacks"] = ["training_device_logger"] params["trainer"]["cuda_device"] = -1 global _seen_training_devices @@ -236,8 +238,8 @@ def test_train_model_distributed(self): assert load_archive(out_dir).model @cpu_or_gpu - @pytest.mark.parametrize("lazy", [True, False]) - def test_train_model_distributed_with_sharded_reader(self, lazy): + @pytest.mark.parametrize("max_instances_in_memory", [None, 10]) + def test_train_model_distributed_with_sharded_reader(self, max_instances_in_memory): if torch.cuda.device_count() >= 2: devices = [0, 1] else: @@ -252,14 +254,13 @@ def test_train_model_distributed_with_sharded_reader(self, lazy): }, "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, }, - "dataset_reader": { - "type": "sharded", - "base_reader": {"type": "sequence_tagging"}, - "lazy": lazy, - }, + "dataset_reader": {"type": "sharded", "base_reader": {"type": "sequence_tagging"}}, "train_data_path": SEQUENCE_TAGGING_SHARDS_PATH, "validation_data_path": SEQUENCE_TAGGING_SHARDS_PATH, - "data_loader": {"batch_size": 2}, + "data_loader": { + "batch_size": 1, + "max_instances_in_memory": max_instances_in_memory, + }, "trainer": {"num_epochs": 2, "optimizer": "adam"}, "distributed": {"cuda_devices": devices}, } @@ -325,8 +326,8 @@ def test_train_model_distributed_with_sharded_reader(self, lazy): assert validation_complete in worker1_log @cpu_or_gpu - @pytest.mark.parametrize("lazy", [True, False]) - def test_train_model_distributed_without_sharded_reader(self, lazy: bool): + @pytest.mark.parametrize("max_instances_in_memory", [None, 10]) + def test_train_model_distributed_without_sharded_reader(self, max_instances_in_memory): if torch.cuda.device_count() >= 2: devices = [0, 1] else: @@ -342,16 +343,17 @@ def test_train_model_distributed_without_sharded_reader(self, lazy: bool): }, "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, }, - "dataset_reader": {"type": "sequence_tagging", "lazy": lazy}, + "dataset_reader": {"type": "sequence_tagging"}, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, - "data_loader": {"batch_size": 1}, + "data_loader": { + "batch_size": 1, + "max_instances_in_memory": max_instances_in_memory, + }, "trainer": { "num_epochs": num_epochs, "optimizer": "adam", - "batch_callbacks": [ - "tests.commands.train_test.TrainingDataLoggerBatchCallback" - ], + "callbacks": ["tests.commands.train_test.TrainingDataLoggerOnBatchCallback"], }, "distributed": {"cuda_devices": devices}, } @@ -527,9 +529,9 @@ def __init__(self, optimizer: torch.optim.Optimizer, num_steps_per_epoch: 
int): batch_callback_counter = 0 - @BatchCallback.register("counter") - class CounterBatchCallback(BatchCallback): - def __call__( + @TrainerCallback.register("counter") + class CounterOnBatchCallback(TrainerCallback): + def on_batch( self, trainer: GradientDescentTrainer, batch_inputs: List[List[TensorDict]], @@ -538,7 +540,8 @@ def __call__( epoch: int, batch_number: int, is_training: bool, - is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: nonlocal batch_callback_counter if is_training: @@ -563,7 +566,7 @@ def __call__( "num_epochs": number_of_epochs, "optimizer": "adam", "learning_rate_scheduler": {"type": "mock"}, - "batch_callbacks": ["counter"], + "callbacks": ["counter"], }, } ) @@ -640,63 +643,6 @@ def test_train_can_fine_tune_model_from_archive(self): # parameters such that the vocab should have been extended. assert train_loop.model.vocab.get_vocab_size() > model.vocab.get_vocab_size() - -@DatasetReader.register("lazy-test") -class LazyFakeReader(DatasetReader): - def __init__(self) -> None: - super().__init__(lazy=True) - self.reader = DatasetReader.from_params(Params({"type": "sequence_tagging", "lazy": True})) - - def _read(self, file_path: str) -> Iterable[Instance]: - """ - Reads some data from the `file_path` and returns the instances. - """ - return self.reader.read(file_path) - - -class TestTrainOnLazyDataset(AllenNlpTestCase): - def test_train_model(self): - params = Params( - { - "model": { - "type": "simple_tagger", - "text_field_embedder": { - "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} - }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, - }, - "dataset_reader": {"type": "lazy-test"}, - "train_data_path": SEQUENCE_TAGGING_DATA_PATH, - "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, - "data_loader": {"batch_size": 2}, - "trainer": {"num_epochs": 2, "optimizer": "adam"}, - } - ) - - train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "train_lazy_model")) - - def test_train_with_test_set(self): - params = Params( - { - "model": { - "type": "simple_tagger", - "text_field_embedder": { - "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}} - }, - "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2}, - }, - "dataset_reader": {"type": "lazy-test"}, - "train_data_path": SEQUENCE_TAGGING_DATA_PATH, - "test_data_path": SEQUENCE_TAGGING_DATA_PATH, - "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, - "evaluate_on_test": True, - "data_loader": {"batch_size": 2}, - "trainer": {"num_epochs": 2, "optimizer": "adam"}, - } - ) - - train_model(params, serialization_dir=os.path.join(self.TEST_DIR, "lazy_test_set")) - def test_train_nograd_regex(self): params_get = lambda: Params( { diff --git a/tests/common/file_utils_test.py b/tests/common/file_utils_test.py index 550b30e111e..b99636c4f14 100644 --- a/tests/common/file_utils_test.py +++ b/tests/common/file_utils_test.py @@ -8,6 +8,7 @@ from filelock import Timeout import pytest import responses +import torch from requests.exceptions import ConnectionError from allennlp.common import file_utils @@ -24,6 +25,8 @@ _find_entries, inspect_cache, remove_cache_entries, + LocalCacheResource, + TensorCache, ) from allennlp.common.testing import AllenNlpTestCase @@ -492,3 +495,55 @@ def test_temp_file_removed_on_error(self): raise IOError("I made this up") assert not os.path.exists(handle.name) assert not os.path.exists(cache_filename) + + +class TestLocalCacheResource(AllenNlpTestCase): + def 
test_local_cache_resource(self): + with LocalCacheResource("some-computation", "version-1", cache_dir=self.TEST_DIR) as cache: + assert not cache.cached() + + with cache.writer() as w: + json.dump({"a": 1}, w) + + with LocalCacheResource("some-computation", "version-1", cache_dir=self.TEST_DIR) as cache: + assert cache.cached() + + with cache.reader() as r: + data = json.load(r) + + assert data["a"] == 1 + + +class TestTensorCace(AllenNlpTestCase): + def test_tensor_cache(self): + cache = TensorCache(self.TEST_DIR / "cache") + assert not cache.read_only + + # Insert some stuff into the cache. + cache["a"] = torch.tensor([1, 2, 3]) + + # Close cache. + del cache + + # Now let's open another one in read-only mode. + cache = TensorCache(self.TEST_DIR / "cache", read_only=True) + assert cache.read_only + + # If we try to write we should get a ValueError + with pytest.raises(ValueError, match="cannot write"): + cache["b"] = torch.tensor([1, 2, 3]) + + # But we should be able to retrieve from the cache. + assert cache["a"].shape == (3,) + + # Close this one. + del cache + + # Now we're going to tell the OS to make the cache file read-only. + os.chmod(self.TEST_DIR / "cache", 0o444) + os.chmod(self.TEST_DIR / "cache-lock", 0o444) + + # This time when we open the cache, it should automatically be set to read-only. + with pytest.warns(UserWarning, match="cache will be read-only"): + cache = TensorCache(self.TEST_DIR / "cache") + assert cache.read_only diff --git a/tests/common/from_params_test.py b/tests/common/from_params_test.py index 1667a563fc6..dfd479ffe3d 100644 --- a/tests/common/from_params_test.py +++ b/tests/common/from_params_test.py @@ -595,40 +595,32 @@ def test_transferring_of_modules_ensures_type_consistency(self): Model.from_params(vocab=trained_model.vocab, params=Params(model_params)) def test_bare_string_params(self): - dataset = [1] + reader = DatasetReader.from_params(Params({"type": "text_classification_json"})) class TestLoader(Registrable): @classmethod def from_partial_objects(cls, data_loader: Lazy[DataLoader]) -> DataLoader: - return data_loader.construct(dataset=dataset) + return data_loader.construct( + reader=reader, + data_path=str( + self.FIXTURES_ROOT + / "data" + / "text_classification_json" + / "imdb_corpus2.jsonl" + ), + ) TestLoader.register("test", constructor="from_partial_objects")(TestLoader) data_loader = TestLoader.from_params( - Params( - { - "type": "test", - "data_loader": { - "batch_sampler": { - "type": "basic", - "batch_size": 2, - "drop_last": True, - "sampler": "random", - } - }, - } - ) + Params({"type": "test", "data_loader": {"batch_size": 2}}) ) - assert data_loader.batch_sampler.sampler.__class__.__name__ == "RandomSampler" - assert data_loader.batch_sampler.sampler.data_source is dataset + assert data_loader.batch_size == 2 def test_kwargs_are_passed_to_superclass(self): - params = Params( - {"type": "text_classification_json", "lazy": True, "cache_directory": "tmp"} - ) + params = Params({"type": "text_classification_json", "max_instances": 50}) reader = DatasetReader.from_params(params) - assert reader.lazy is True - assert str(reader._cache_directory) == "tmp" + assert reader.max_instances == 50 def test_kwargs_with_multiple_inheritance(self): # Basic idea: have two identical classes, differing only in the order of their multiple diff --git a/tests/common/registrable_test.py b/tests/common/registrable_test.py index 571552d2e5b..ff92b7f3861 100644 --- a/tests/common/registrable_test.py +++ b/tests/common/registrable_test.py @@ -8,7 +8,6 @@ 
from allennlp.common.testing import AllenNlpTestCase from allennlp.common.util import push_python_path from allennlp.data.dataset_readers.dataset_reader import DatasetReader -from allennlp.data.samplers import Sampler, BatchSampler from allennlp.data.token_indexers.token_indexer import TokenIndexer from allennlp.data.tokenizers.tokenizer import Tokenizer from allennlp.modules.text_field_embedders.text_field_embedder import TextFieldEmbedder @@ -70,13 +69,6 @@ class FakeAlternate2(base_class): del Registrable._registry[base_class]["fake"] - # TODO(mattg): maybe move all of these into tests for the base class? - - def test_registry_has_builtin_samplers(self): - assert Sampler.by_name("random").__name__ == "RandomSampler" - assert Sampler.by_name("sequential").__name__ == "SequentialSampler" - assert BatchSampler.by_name("bucket").__name__ == "BucketBatchSampler" - def test_registry_has_builtin_tokenizers(self): assert Tokenizer.by_name("spacy").__name__ == "SpacyTokenizer" assert Tokenizer.by_name("character").__name__ == "CharacterTokenizer" diff --git a/tests/common/util_test.py b/tests/common/util_test.py index 6633236af5b..d6bec1c07fd 100644 --- a/tests/common/util_test.py +++ b/tests/common/util_test.py @@ -136,6 +136,27 @@ def create_backward_token_test_case(backward_token): actual = util.sanitize_ptb_tokenized_string(ptb_string) assert actual == expected + def test_cycle_iterator_function(self): + global cycle_iterator_function_calls + cycle_iterator_function_calls = 0 + + def one_and_two(): + global cycle_iterator_function_calls + cycle_iterator_function_calls += 1 + for i in [1, 2]: + yield i + + iterator = iter(util.cycle_iterator_function(one_and_two)) + + # Function calls should be lazy. + assert cycle_iterator_function_calls == 0 + + values = [next(iterator) for _ in range(5)] + assert values == [1, 2, 1, 2, 1] + # This is the difference between cycle_iterator_function and itertools.cycle. We'd only see + # 1 here with itertools.cycle. + assert cycle_iterator_function_calls == 3 + @pytest.mark.parametrize( "size, result", diff --git a/tests/data/data_loaders/__init__.py b/tests/data/data_loaders/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/data/data_loaders/multiprocess_data_loader_test.py b/tests/data/data_loaders/multiprocess_data_loader_test.py new file mode 100644 index 00000000000..e0197edee71 --- /dev/null +++ b/tests/data/data_loaders/multiprocess_data_loader_test.py @@ -0,0 +1,201 @@ +from typing import List, Iterable, Dict + +import torch +import pytest +from allennlp.common.testing import requires_gpu +from allennlp.data.instance import Instance +from allennlp.data.dataset_readers import DatasetReader +from allennlp.data.data_loaders import MultiProcessDataLoader, WorkerError +from allennlp.data.fields import Field, TextField, MetadataField, TensorField +from allennlp.data.tokenizers import PretrainedTransformerTokenizer +from allennlp.data.token_indexers import PretrainedTransformerIndexer +from allennlp.data.vocabulary import Vocabulary + + +class MockDatasetReader(DatasetReader): + """ + We'll use this mock dataset reader for most of the tests. + + It utilizes a transformers tokenizer, since historically we've seen deadlocking + issues when using these within subprocesses. So these tests also serve as + regression tests against those issues. + + And unlike `MockOldDatasetReader` below, it implements all of the new API, + specifically the `apply_token_indexers` method, so that it can be used + with num_workers > 0. 
+ """ + + NUM_INSTANCES = 100 + + def __init__(self, model: str = "epwalsh/bert-xsmall-dummy", **kwargs) -> None: + super().__init__( + manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs + ) + self.tokenizer = PretrainedTransformerTokenizer(model) + self.token_indexers = {"tokens": PretrainedTransformerIndexer(model)} + + def _read(self, file_path: str): + for i in self.shard_iterable(range(self.NUM_INSTANCES)): + source = f"Hi there, I'm the {i}th instance" + target = f"Hello, {i}th instance!" + yield self.text_to_instance(i, source, target) + + def text_to_instance(self, index: int, source: str, target: str = None) -> Instance: # type: ignore + fields: Dict[str, Field] = {} + fields["source"] = TextField(self.tokenizer.tokenize(source)) + fields["index"] = MetadataField(index) # type: ignore + # It's important to have tests that use a tensor field since sending tensors + # between processes has a lot of pitfalls. + fields["tensor"] = TensorField(torch.tensor([1, 2, 3])) + if target is not None: + fields["target"] = TextField(self.tokenizer.tokenize(target)) + return Instance(fields) # type: ignore + + def apply_token_indexers(self, instance: Instance) -> None: + instance.fields["source"].token_indexers = self.token_indexers # type: ignore + if "target" in instance.fields: + instance.fields["target"].token_indexers = self.token_indexers # type: ignore + + +class MockOldDatasetReader(DatasetReader): + def __init__(self, model: str = "epwalsh/bert-xsmall-dummy", **kwargs) -> None: + super().__init__(**kwargs) + self.tokenizer = PretrainedTransformerTokenizer(model) + self.token_indexers = {"tokens": PretrainedTransformerIndexer(model)} + + def _read(self, file_path: str): + for i in range(10): + source = f"Hi there, I'm the {i}th instance" + target = f"Hello, {i}th instance!" + yield self.text_to_instance(source, target) + + def text_to_instance(self, source: str, target: str = None) -> Instance: # type: ignore + fields = {} + fields["source"] = TextField(self.tokenizer.tokenize(source), self.token_indexers) # type: ignore + if target is not None: + fields["target"] = TextField(self.tokenizer.tokenize(target), self.token_indexers) # type: ignore + return Instance(fields) # type: ignore + + +@pytest.mark.parametrize("max_instances_in_memory", (None, 10)) +def test_error_raised_when_text_fields_contain_token_indexers(max_instances_in_memory): + """ + This tests that the MultiProcessDataLoader raises an error when num_workers > 0 + but the dataset reader doesn't implement apply_token_indexers(). + + It also tests that errors raised within a worker process are propogated upwards + to the main process, and that when that happens, all workers will be successfully + killed. 
+ """ + + with pytest.raises(WorkerError, match="Make sure your dataset reader's text_to_instance()"): + loader = MultiProcessDataLoader( + MockOldDatasetReader(), + "this-path-doesn't-matter", + num_workers=2, + max_instances_in_memory=max_instances_in_memory, + batch_size=1, + ) + list(loader.iter_instances()) + + +@pytest.mark.parametrize( + "options", + [ + dict(max_instances_in_memory=10, num_workers=2, batch_size=1), + dict(num_workers=2, batch_size=1), + dict(max_instances_in_memory=10, num_workers=2, start_method="spawn", batch_size=1), + dict(num_workers=2, start_method="spawn", batch_size=1), + dict(max_instances_in_memory=10, num_workers=0, batch_size=1), + dict(num_workers=0, batch_size=1), + ], + ids=str, +) +def test_multiprocess_data_loader(options): + reader = MockDatasetReader() + data_path = "this doesn't matter" + + loader = MultiProcessDataLoader(reader=reader, data_path=data_path, **options) + if not options.get("max_instances_in_memory"): + # Instances should be loaded immediately if max_instances_in_memory is None. + assert loader._instances + + instances: Iterable[Instance] = loader.iter_instances() + # This should be a generator. + assert not isinstance(instances, (list, tuple)) + instances = list(instances) + assert len(instances) == MockDatasetReader.NUM_INSTANCES + + # Now build vocab. + vocab = Vocabulary.from_instances(instances) + + # Before indexing the loader, trying to iterate through batches will raise an error. + with pytest.raises(ValueError, match="Did you forget to call DataLoader.index_with"): + list(loader) + + loader.index_with(vocab) + + # Run through a couple epochs to make sure we collect all of the instances. + for epoch in range(2): + indices: List[int] = [] + for batch in loader: + for index in batch["index"]: + indices.append(index) # type: ignore + # Ensure no duplicates. + assert len(indices) == len(set(indices)), indices + # Ensure all collected. + assert len(indices) == MockDatasetReader.NUM_INSTANCES, epoch + + +def test_drop_last(): + """ + Ensures that the `drop_last` option is respected. + """ + loader = MultiProcessDataLoader(MockDatasetReader(), "some path", batch_size=16, drop_last=True) + vocab = Vocabulary.from_instances(loader.iter_instances()) + loader.index_with(vocab) + + # Should still load all instances. `drop_last` only affects batches. + assert len(list(loader.iter_instances())) == MockDatasetReader.NUM_INSTANCES + + # Just here because the assertions below depend on the exact value of NUM_INSTANCES. 
+ assert MockDatasetReader.NUM_INSTANCES == 100 + batches = list(loader) + for batch in batches: + assert len(batch["index"]) == 16 + assert len(batches) == 6 + + +def test_batches_per_epoch(): + loader = MultiProcessDataLoader( + MockDatasetReader(), "some path", batch_size=4, batches_per_epoch=10 + ) + vocab = Vocabulary.from_instances(loader.iter_instances()) + loader.index_with(vocab) + + assert len(loader) == 10 + assert len(list(loader)) == 10 + + +@pytest.mark.parametrize( + "options", + [ + dict(num_workers=0, batch_size=2), + dict(num_workers=1, batch_size=2), + dict(num_workers=1, batch_size=2, start_method="spawn"), + ], + ids=str, +) +@requires_gpu +def test_load_to_cuda(options): + reader = MockDatasetReader() + loader = MultiProcessDataLoader( + reader=reader, + data_path="this doens't matter", + cuda_device=0, + **options, + ) + vocab = Vocabulary.from_instances(loader.iter_instances()) + loader.index_with(vocab) + for batch in loader: + assert batch["tensor"].device == torch.device("cuda:0") diff --git a/tests/data/data_loaders/multitask_data_loader_test.py b/tests/data/data_loaders/multitask_data_loader_test.py new file mode 100644 index 00000000000..35b28dfb721 --- /dev/null +++ b/tests/data/data_loaders/multitask_data_loader_test.py @@ -0,0 +1,81 @@ +import itertools + +import pytest +import torch + +from allennlp.common.util import cycle_iterator_function +from allennlp.data import DatasetReader, Instance, Vocabulary +from allennlp.data.fields import LabelField +from allennlp.data.dataset_readers import MultiTaskDatasetReader +from allennlp.data.data_loaders.multitask_data_loader import MultiTaskDataLoader +from allennlp.data.data_loaders.multitask_scheduler import RoundRobinScheduler +from allennlp.data.data_loaders.multitask_epoch_sampler import UniformSampler, WeightedSampler + + +class FakeDatasetReaderA(DatasetReader): + def _read(self, file_path: str): + return itertools.islice( + cycle_iterator_function(lambda: [Instance({"label": LabelField("A")})]), 100 + ) + + +class FakeDatasetReaderB(DatasetReader): + def _read(self, file_path: str): + return itertools.islice( + cycle_iterator_function(lambda: [Instance({"label": LabelField("B")})]), 100 + ) + + +class MultiTaskDataLoaderTest: + def test_loading(self): + reader = MultiTaskDatasetReader( + readers={"a": FakeDatasetReaderA(), "b": FakeDatasetReaderB()} + ) + data_path = {"a": "ignored", "b": "ignored"} + scheduler = RoundRobinScheduler(batch_size=4) + sampler = UniformSampler() + loader = MultiTaskDataLoader( + reader=reader, + data_path=data_path, + scheduler=scheduler, + sampler=sampler, + instances_per_epoch=8, + max_instances_in_memory={"a": 10, "b": 10}, + ) + vocab = Vocabulary() + vocab.add_tokens_to_namespace(["A", "B"], "labels") + loader.index_with(vocab) + iterator = iter(loader) + batch = next(iterator) + assert torch.all(batch["label"] == torch.IntTensor([0, 1, 0, 1])) + batch = next(iterator) + assert torch.all(batch["label"] == torch.IntTensor([0, 1, 0, 1])) + with pytest.raises(StopIteration): + next(iterator) + + def test_loading_with_sampler(self): + reader = MultiTaskDatasetReader( + readers={"a": FakeDatasetReaderA(), "b": FakeDatasetReaderB()} + ) + data_path = {"a": "ignored", "b": "ignored"} + scheduler = RoundRobinScheduler(batch_size=4) + sampler = WeightedSampler({"a": 1, "b": 2}) + loader = MultiTaskDataLoader( + reader=reader, + data_path=data_path, + scheduler=scheduler, + sampler=sampler, + instances_per_epoch=9, + ) + vocab = Vocabulary() + vocab.add_tokens_to_namespace(["A", 
"B"], "labels") + loader.index_with(vocab) + iterator = iter(loader) + batch = next(iterator) + assert torch.all(batch["label"] == torch.IntTensor([0, 1, 0, 1])) + batch = next(iterator) + assert torch.all(batch["label"] == torch.IntTensor([0, 1, 1, 1])) + batch = next(iterator) + assert torch.all(batch["label"] == torch.IntTensor([1])) + with pytest.raises(StopIteration): + next(iterator) diff --git a/tests/data/data_loaders/multitask_scheduler_test.py b/tests/data/data_loaders/multitask_scheduler_test.py new file mode 100644 index 00000000000..40f05d9e9a7 --- /dev/null +++ b/tests/data/data_loaders/multitask_scheduler_test.py @@ -0,0 +1,35 @@ +from allennlp.data.data_loaders.multitask_scheduler import ( + RoundRobinScheduler, + HomogeneousRoundRobinScheduler, +) + + +class RoundRobinSchedulerTest: + def test_order_instances(self): + scheduler = RoundRobinScheduler(batch_size=4) + epoch_instances = { + "a": [1] * 5, + "b": [2] * 3, + } + batches = scheduler.batch_instances(epoch_instances) + assert list(batches) == [[1, 2, 1, 2], [1, 2, 1, 1]] + + +class HomogeneousRoundRobinSchedulerTest: + def test_order_instances(self): + scheduler = HomogeneousRoundRobinScheduler({"a": 2, "b": 3}) + epoch_instances = { + "a": [1] * 9, + "b": [2] * 9, + } + flattened = scheduler.batch_instances(epoch_instances) + assert list(flattened) == [ + [1, 1], + [2, 2, 2], + [1, 1], + [2, 2, 2], + [1, 1], + [2, 2, 2], + [1, 1], + [1], + ] diff --git a/tests/data/dataloader_test.py b/tests/data/dataloader_test.py deleted file mode 100644 index cb422f61945..00000000000 --- a/tests/data/dataloader_test.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Iterable - -import pytest - -from allennlp.data.fields import LabelField -from allennlp.data.instance import Instance -from allennlp.data.dataloader import PyTorchDataLoader -from allennlp.data.dataset_readers.dataset_reader import DatasetReader - - -@pytest.mark.parametrize("lazy", (True, False)) -def test_loader_uses_all_instances_when_batches_per_epochs_set(lazy): - NUM_INSTANCES = 20 - BATCH_SIZE = 2 - BATCHES_PER_EPOCH = 3 - EPOCHS = 4 - - class FakeDatasetReader(DatasetReader): - def _read(self, filename: str) -> Iterable[Instance]: - for i in range(NUM_INSTANCES): - yield Instance({"index": LabelField(i, skip_indexing=True)}) - - reader = FakeDatasetReader(lazy=lazy) - dataset = reader.read("blah") - - loader = PyTorchDataLoader(dataset, batch_size=BATCH_SIZE, batches_per_epoch=BATCHES_PER_EPOCH) - epoch_batches = [] - for epoch in range(EPOCHS): - batches = [] - for batch in loader: - instances = [] - for index in batch["index"]: - instances.append(index) - batches.append(instances) - epoch_batches.append(batches) - - assert epoch_batches == [ - # Epoch 0. - [[0, 1], [2, 3], [4, 5]], - # Epoch 1. - [[6, 7], [8, 9], [10, 11]], - # Epoch 2. - [[12, 13], [14, 15], [16, 17]], - # Epoch 3. 
- [[18, 19], [0, 1], [2, 3]], - ] diff --git a/tests/data/dataset_readers/babi_reader_test.py b/tests/data/dataset_readers/babi_reader_test.py index 3fcf244e652..687f548ed10 100644 --- a/tests/data/dataset_readers/babi_reader_test.py +++ b/tests/data/dataset_readers/babi_reader_test.py @@ -1,18 +1,15 @@ import pytest from allennlp.common import Params -from allennlp.common.util import ensure_list from allennlp.data.dataset_readers import BabiReader from allennlp.common.testing import AllenNlpTestCase class TestBAbIReader: - @pytest.mark.parametrize( - "keep_sentences, lazy", [(False, False), (False, True), (True, False), (True, True)] - ) - def test_read_from_file(self, keep_sentences, lazy): - reader = BabiReader(keep_sentences=keep_sentences, lazy=lazy) - instances = ensure_list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "babi.txt")) + @pytest.mark.parametrize("keep_sentences", [False, True]) + def test_read_from_file(self, keep_sentences): + reader = BabiReader(keep_sentences=keep_sentences) + instances = list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "babi.txt")) assert len(instances) == 8 if keep_sentences: diff --git a/tests/data/dataset_readers/dataset_reader_test.py b/tests/data/dataset_readers/dataset_reader_test.py index 22eab7abaeb..aedebefded3 100644 --- a/tests/data/dataset_readers/dataset_reader_test.py +++ b/tests/data/dataset_readers/dataset_reader_test.py @@ -1,287 +1,85 @@ -from collections import deque -import os -import shutil -from typing import Optional, NamedTuple, List +from itertools import islice +from typing import Optional, List, Set -from filelock import FileLock import pytest import torch.distributed as dist -from allennlp.common.testing import AllenNlpTestCase from allennlp.common import util as common_util -from allennlp.common.checks import ConfigurationError from allennlp.data import Instance -from allennlp.data.dataloader import PyTorchDataLoader from allennlp.data.dataset_readers import ( - dataset_reader, DatasetReader, - TextClassificationJsonReader, + WorkerInfo, ) -from allennlp.data.dataset_readers.dataset_reader import AllennlpLazyDataset from allennlp.data.fields import LabelField -def mock_collate_fn(item): - return item[0] +TOTAL_INSTANCES = 100 -class TestDatasetReader(AllenNlpTestCase): - def setup_method(self): - super().setup_method() - self.cache_directory = str(AllenNlpTestCase.FIXTURES_ROOT / "data_cache" / "with_prefix") - - def teardown_method(self): - super().teardown_method() - if os.path.exists(self.cache_directory): - shutil.rmtree(self.cache_directory) - - def test_lazy_dataset_can_be_iterated_through_multiple_times(self): - data_file = ( - AllenNlpTestCase.FIXTURES_ROOT - / "data" - / "text_classification_json" - / "imdb_corpus.jsonl" - ) - reader = TextClassificationJsonReader(lazy=True) - instances = reader.read(data_file) - assert isinstance(instances, AllennlpLazyDataset) - - first_pass_instances = list(instances) - assert len(first_pass_instances) > 2 - second_pass_instances = list(instances) - assert first_pass_instances == second_pass_instances - - def test_read_only_creates_cache_file_once(self): - data_file = ( - AllenNlpTestCase.FIXTURES_ROOT - / "data" - / "text_classification_json" - / "imdb_corpus.jsonl" - ) - reader = TextClassificationJsonReader(cache_directory=self.cache_directory) - cache_file = reader._get_cache_location_for_file_path(str(data_file)) - - # The first read will create the cache. 
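The deleted PyTorchDataLoader-based tests in this region are superseded by the reader-driven MultiProcessDataLoader covered at the top of this diff. A minimal sketch of that flow, using the same sequence-tagging fixture these tests rely on; everything else follows the calls shown in the new loader tests, and is illustrative rather than part of the change itself:

from allennlp.common.testing import AllenNlpTestCase
from allennlp.data import Vocabulary
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader

# The loader owns the reader and the data path; batches_per_epoch caps one epoch's length.
loader = MultiProcessDataLoader(
    SequenceTaggingDatasetReader(),
    str(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
    batch_size=2,
    batches_per_epoch=3,
)
vocab = Vocabulary.from_instances(loader.iter_instances())
loader.index_with(vocab)  # required before iterating, so instances can be indexed
batches = list(loader)    # yields exactly batches_per_epoch batches per pass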
- reader.read(data_file) - assert os.path.exists(cache_file) - with open(cache_file, "r") as in_file: - cache_contents = in_file.read() - # The second and all subsequent reads should _use_ the cache, not modify it. I looked - # into checking file modification times, but this test will probably be faster than the - # granularity of `os.path.getmtime()` (which only returns values in seconds). - reader.read(data_file) - reader.read(data_file) - reader.read(data_file) - reader.read(data_file) - with open(cache_file, "r") as in_file: - final_cache_contents = in_file.read() - assert cache_contents == final_cache_contents - - @pytest.mark.parametrize("lazy", (True, False)) - def test_caching_works_with_lazy_reading(self, caplog, lazy: bool): - data_file = ( - AllenNlpTestCase.FIXTURES_ROOT - / "data" - / "text_classification_json" - / "imdb_corpus.jsonl" - ) - snli_copy_file = str(data_file) + ".copy" - shutil.copyfile(data_file, snli_copy_file) - reader = TextClassificationJsonReader(lazy=lazy, cache_directory=self.cache_directory) - cache_file = reader._get_cache_location_for_file_path(snli_copy_file) - - # The call to read() will give us an _iterator_. We'll iterate over it multiple times, - # and the caching behavior should change as we go. - assert not os.path.exists(cache_file) - instances = reader.read(snli_copy_file) - - # The first iteration will create the cache - first_pass_instances = [] - for instance in instances: - first_pass_instances.append(instance) - assert "Caching instances to temp file" in " ".join([rec.message for rec in caplog.records]) - assert os.path.exists(cache_file) - - # Now we _remove_ the data file, to be sure we're reading from the cache. - os.remove(snli_copy_file) - caplog.clear() - instances = reader.read(snli_copy_file) - second_pass_instances = [] - for instance in instances: - second_pass_instances.append(instance) - assert "Reading instances from cache" in " ".join([rec.message for rec in caplog.records]) - - # We should get the same instances both times. - assert len(first_pass_instances) == len(second_pass_instances) - for instance, cached_instance in zip(first_pass_instances, second_pass_instances): - assert instance.fields == cached_instance.fields - - # And just to be super paranoid, in case the second pass somehow bypassed the cache - # because of a bug that's hard to detect, we'll read the - # instances from the cache with a non-lazy iterator and make sure they're the same. - reader = TextClassificationJsonReader(lazy=False, cache_directory=self.cache_directory) - cached_instances = reader.read(snli_copy_file) - assert len(first_pass_instances) == len(cached_instances) - for instance, cached_instance in zip(first_pass_instances, cached_instances): - assert instance.fields == cached_instance.fields - - @pytest.mark.parametrize("lazy", (True, False)) - def test_caching_skipped_when_lock_not_acquired(self, caplog, lazy: bool): - data_file = ( - AllenNlpTestCase.FIXTURES_ROOT - / "data" - / "text_classification_json" - / "imdb_corpus.jsonl" - ) - reader = TextClassificationJsonReader(lazy=lazy, cache_directory=self.cache_directory) - reader.CACHE_FILE_LOCK_TIMEOUT = 1 - cache_file = reader._get_cache_location_for_file_path(str(data_file)) - - with FileLock(cache_file + ".lock"): - # Right now we hold the lock on the cache, so the reader shouldn't - # be able to write to it. It will wait for 1 second (because that's what - # we set the timeout to be), and then just read the instances as normal. 
- caplog.clear() - instances = list(reader.read(data_file)) - assert "Failed to acquire lock" in caplog.text - assert instances - - # We didn't write to the cache because we couldn't acquire the file lock. - assert not os.path.exists(cache_file) - - # Now we'll write to the cache and then try the same thing again, this - # time making sure that we can still successfully read without the cache - # when the lock can't be acquired. - deque(reader.read(data_file), maxlen=1) - assert os.path.exists(cache_file) - - with FileLock(cache_file + ".lock"): - # Right now we hold the lock on the cache, so the reader shouldn't - # be able to write to it. It will wait for 1 second (because that's what - # we set the timeout to be), and then just read the instances as normal. - caplog.clear() - instances = list(reader.read(data_file)) - assert "Failed to acquire lock" in caplog.text - assert instances +class MockDatasetReader(DatasetReader): + def _read(self, file_path): + for i in range(TOTAL_INSTANCES): + yield self.text_to_instance(i) - @pytest.mark.parametrize("lazy", (True, False)) - def test_caching_skipped_with_distributed_training(self, caplog, monkeypatch, lazy): - monkeypatch.setattr(common_util, "is_distributed", lambda: True) - monkeypatch.setattr(dist, "get_rank", lambda: 0) - monkeypatch.setattr(dist, "get_world_size", lambda: 1) + def text_to_instance(self, index: int): # type: ignore + return Instance({"index": LabelField(index, skip_indexing=True)}) - data_file = ( - AllenNlpTestCase.FIXTURES_ROOT - / "data" - / "text_classification_json" - / "imdb_corpus.jsonl" - ) - reader = TextClassificationJsonReader(lazy=lazy, cache_directory=self.cache_directory) - cache_file = reader._get_cache_location_for_file_path(str(data_file)) - deque(reader.read(data_file), maxlen=1) - assert not os.path.exists(cache_file) - assert "Can't cache data instances when there are multiple processes" in caplog.text +class MockMmpsDatasetReader(DatasetReader): + """ + Implements manual multi-process sharding (MMPS). + """ - def test_caching_with_lazy_reader_in_multi_process_loader(self): - data_file = ( - AllenNlpTestCase.FIXTURES_ROOT - / "data" - / "text_classification_json" - / "imdb_corpus.jsonl" - ) - reader = TextClassificationJsonReader(lazy=True, cache_directory=self.cache_directory) - deque( - PyTorchDataLoader(reader.read(data_file), collate_fn=mock_collate_fn, num_workers=2), - maxlen=0, - ) + def __init__(self, **kwargs) -> None: + super().__init__(manual_multiprocess_sharding=True, **kwargs) - # We shouldn't write to the cache when the data is being loaded from multiple - # processes. - cache_file = reader._get_cache_location_for_file_path(str(data_file)) - assert not os.path.exists(cache_file) + def _read(self, file_path): + start_index = 0 + step_size = 1 + worker_info = self.get_worker_info() + if worker_info is not None: + start_index += worker_info.id + step_size *= worker_info.num_workers + for i in islice(range(TOTAL_INSTANCES), start_index, None, step_size): + yield self.text_to_instance(i) - # But try again from the main process and we should see the cache file. - instances = list(reader.read(data_file)) - assert instances - assert os.path.exists(cache_file) + def text_to_instance(self, index: int): # type: ignore + return Instance({"index": LabelField(index, skip_indexing=True)}) - # Reading again from a multi-process loader should read from the cache. 
- new_instances = list( - PyTorchDataLoader(reader.read(data_file), collate_fn=mock_collate_fn, num_workers=2) - ) - assert len(instances) == len(new_instances) - @pytest.mark.parametrize("lazy", (True, False)) - def test_max_instances(self, lazy): - data_file = ( - AllenNlpTestCase.FIXTURES_ROOT - / "data" - / "text_classification_json" - / "imdb_corpus.jsonl" - ) - reader = TextClassificationJsonReader(max_instances=2, lazy=lazy) - instances = reader.read(data_file) - instance_count = sum(1 for _ in instances) - assert instance_count == 2 +class MockMdsDatasetReader(DatasetReader): + """ + Implements manual distributed sharding (MDS). + """ - @pytest.mark.parametrize("num_workers", (0, 1, 2)) - def test_max_instances_with_multi_process_loader(self, num_workers): - data_file = ( - AllenNlpTestCase.FIXTURES_ROOT - / "data" - / "text_classification_json" - / "imdb_corpus.jsonl" - ) - reader = TextClassificationJsonReader(max_instances=2, lazy=True) - instances = list( - PyTorchDataLoader( - reader.read(data_file), collate_fn=mock_collate_fn, num_workers=num_workers - ) - ) - assert len(instances) == 2 + def __init__(self, **kwargs) -> None: + super().__init__(manual_distributed_sharding=True, **kwargs) - @pytest.mark.parametrize("lazy", (True, False)) - def test_cached_max_instances(self, lazy): - data_file = ( - AllenNlpTestCase.FIXTURES_ROOT - / "data" - / "text_classification_json" - / "imdb_corpus.jsonl" - ) + def _read(self, file_path): + start_index = 0 + step_size = 1 + if common_util.is_distributed(): + start_index += dist.get_rank() + step_size *= dist.get_world_size() + for i in islice(range(TOTAL_INSTANCES), start_index, None, step_size): + yield self.text_to_instance(i) - # If we try reading with max instances, it shouldn't write to the cache. - reader = TextClassificationJsonReader( - cache_directory=self.cache_directory, lazy=lazy, max_instances=2 - ) - instances = list(reader.read(data_file)) - assert len(instances) == 2 + def text_to_instance(self, index: int): # type: ignore + return Instance({"index": LabelField(index, skip_indexing=True)}) - cache_file = reader._get_cache_location_for_file_path(str(data_file)) - assert not os.path.exists(cache_file) - # Now reading again with no max_instances specified should create the cache. - reader = TextClassificationJsonReader(cache_directory=self.cache_directory, lazy=lazy) - instances = list(reader.read(data_file)) - assert len(instances) > 2 - assert os.path.exists(cache_file) +class MockMmpdsDatasetReader(DatasetReader): + """ + Implements manual multi-process and distributed sharding (MMPDS). + """ - # The second read should only return two instances, even though it's from the cache. 
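The mock readers defined in this file exercise the new sharding hooks. As a compressed sketch, a reader that opts into both manual sharding modes can delegate the worker/rank arithmetic entirely to shard_iterable(), following the same pattern as the MMPDS mock; the reader below is a hypothetical toy, not part of the diff:

from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import LabelField

class ShardedToyReader(DatasetReader):
    """Hypothetical reader mirroring the MMPDS mock in this test file."""

    def __init__(self, **kwargs) -> None:
        super().__init__(
            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
        )

    def _read(self, file_path):
        # shard_iterable() strides over the data so each worker/rank sees a disjoint slice.
        for i in self.shard_iterable(range(100)):
            yield self.text_to_instance(i)

    def text_to_instance(self, index: int):  # type: ignore
        return Instance({"index": LabelField(index, skip_indexing=True)})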
- reader = TextClassificationJsonReader( - cache_directory=self.cache_directory, max_instances=2, lazy=lazy + def __init__(self, **kwargs) -> None: + super().__init__( + manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs ) - instances = list(reader.read(data_file)) - assert len(instances) == 2 - - -class MockWorkerInfo(NamedTuple): - id: int - num_workers: int - -class MockDatasetReader(DatasetReader): def _read(self, file_path): - for i in range(10): + for i in self.shard_iterable(range(TOTAL_INSTANCES)): yield self.text_to_instance(i) def text_to_instance(self, index: int): # type: ignore @@ -289,94 +87,94 @@ def text_to_instance(self, index: int): # type: ignore @pytest.mark.parametrize( - "node_rank, world_size, worker_id, num_workers, max_instances, expected_result", + "world_size, num_workers, max_instances", [ - (None, None, None, None, None, list(range(10))), - (None, None, None, None, 5, list(range(5))), - (None, None, None, None, 12, list(range(10))), - (None, None, 0, 1, None, list(range(10))), - (None, None, 0, 2, None, [0, 2, 4, 6, 8]), - (None, None, 1, 2, None, [1, 3, 5, 7, 9]), - (None, None, 0, 2, 5, [0, 2, 4]), - (None, None, 1, 2, 5, [1, 3]), - (0, 1, None, None, None, list(range(10))), - (0, 2, None, None, None, [0, 2, 4, 6, 8]), - (1, 2, None, None, None, [1, 3, 5, 7, 9]), - (0, 2, None, None, 5, [0, 2, 4]), - (1, 2, None, None, 5, [1, 3]), - (0, 2, 0, 2, None, [0, 4, 8]), - (0, 2, 1, 2, None, [1, 5, 9]), - (1, 2, 0, 2, None, [2, 6]), - (1, 2, 1, 2, None, [3, 7]), - (0, 2, 0, 2, 5, [0, 4]), + (4, 2, None), + (4, 2, 67), + (4, None, None), + (4, None, None), + (None, 2, None), + (None, 2, 67), + (None, None, None), + (None, None, 67), ], ) +@pytest.mark.parametrize( + "reader_class", + [MockDatasetReader, MockMmpsDatasetReader, MockMdsDatasetReader, MockMmpdsDatasetReader], +) def test_instance_slicing( monkeypatch, - node_rank: Optional[int], + reader_class, world_size: Optional[int], - worker_id: Optional[int], num_workers: Optional[int], max_instances: Optional[int], - expected_result: List[int], ): - if node_rank is not None and world_size is not None: - monkeypatch.setattr(common_util, "is_distributed", lambda: True) - monkeypatch.setattr(dist, "get_rank", lambda: node_rank) - monkeypatch.setattr(dist, "get_world_size", lambda: world_size) - - if worker_id is not None and num_workers is not None: - monkeypatch.setattr( - dataset_reader, "get_worker_info", lambda: MockWorkerInfo(worker_id, num_workers) + """ + Ensure that the intances read by each worker are always unique and the total + adds up to `max_instances`. 
+ """ + results: List[Set[int]] = [] + + minimum_expected_result_size = max_instances or TOTAL_INSTANCES + maximum_expected_result_size = max_instances or TOTAL_INSTANCES + + if world_size is not None and num_workers is not None: + minimum_expected_result_size //= world_size + minimum_expected_result_size //= num_workers + maximum_expected_result_size = minimum_expected_result_size + 1 + for global_rank in range(world_size): + monkeypatch.setattr(common_util, "is_distributed", lambda: True) + monkeypatch.setattr(dist, "get_rank", lambda: global_rank) + monkeypatch.setattr(dist, "get_world_size", lambda: world_size) + for worker_id in range(num_workers): + reader = reader_class(max_instances=max_instances) + reader._set_worker_info(WorkerInfo(num_workers, worker_id)) + result = set( + x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore + ) + results.append(result) + elif world_size is not None: + minimum_expected_result_size //= world_size + maximum_expected_result_size = minimum_expected_result_size + 1 + for global_rank in range(world_size): + monkeypatch.setattr(common_util, "is_distributed", lambda: True) + monkeypatch.setattr(dist, "get_rank", lambda: global_rank) + monkeypatch.setattr(dist, "get_world_size", lambda: world_size) + reader = reader_class(max_instances=max_instances) + result = set( + x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore + ) + results.append(result) + elif num_workers is not None: + minimum_expected_result_size //= num_workers + maximum_expected_result_size = minimum_expected_result_size + 1 + for worker_id in range(num_workers): + reader = reader_class(max_instances=max_instances) + reader._set_worker_info(WorkerInfo(num_workers, worker_id)) + result = set( + x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore + ) + results.append(result) + else: + reader = reader_class(max_instances=max_instances) + result = set( + x["index"].label for x in reader.read("the-path-doesnt-matter") # type: ignore ) - - reader = MockDatasetReader(max_instances=max_instances) - result = list((x["index"].label for x in reader.read("the-path-doesnt-matter"))) # type: ignore - - assert result == expected_result - - -class BadLazyReader(DatasetReader): - def _read(self, file_path): - return [self.text_to_instance(i) for i in range(10)] - - def text_to_instance(self, index: int): # type: ignore - return Instance({"index": LabelField(index, skip_indexing=True)}) - - -def test_config_error_when_lazy_reader_returns_list(): - reader = BadLazyReader(lazy=True) - with pytest.raises(ConfigurationError, match="must return a generator"): - deque(reader.read("path"), maxlen=0) - - -class BadReaderReadsNothing(DatasetReader): - def _read(self, file_path): - return [] - - def text_to_instance(self, index: int): # type: ignore - return Instance({"index": LabelField(index, skip_indexing=True)}) - - -def test_config_error_when_reader_returns_no_instances(): - reader = BadReaderReadsNothing() - with pytest.raises(ConfigurationError, match="No instances were read"): - deque(reader.read("path"), maxlen=0) - - -class BadReaderForgetsToSetLazy(DatasetReader): - def __init__(self): - pass - - def _read(self, file_path): - for i in range(10): - yield self.text_to_instance(i) - - def text_to_instance(self, index: int): # type: ignore - return Instance({"index": LabelField(index, skip_indexing=True)}) - - -def warning_when_reader_has_no_lazy_set(): - with pytest.warns(UserWarning, match="DatasetReader.lazy is not set"): - 
reader = BadReaderForgetsToSetLazy() - reader.read("path") + results.append(result) + + # We need to check that all of the result sets are mutually exclusive and that they're + # union has size `max_instances`. + # Checking that they're mutually exclusive is equivalent to checking that the sum + # of the size of each set is equal to the size of the union. + + union: Set[int] = set() + total: int = 0 + for result in results: + union |= result + total += len(result) + # Also make sure the size of the set is within the expected bounds. + assert minimum_expected_result_size <= len(result) + assert len(result) <= maximum_expected_result_size + + assert len(union) == total == (max_instances or TOTAL_INSTANCES) diff --git a/tests/data/dataset_readers/interleaving_dataset_reader_test.py b/tests/data/dataset_readers/interleaving_dataset_reader_test.py index cdd7de2a3be..5e32138eae0 100644 --- a/tests/data/dataset_readers/interleaving_dataset_reader_test.py +++ b/tests/data/dataset_readers/interleaving_dataset_reader_test.py @@ -32,11 +32,11 @@ def test_round_robin(self): reader = InterleavingDatasetReader(readers) data_dir = self.FIXTURES_ROOT / "data" - file_path = f"""{{ - "a": "{data_dir / 'babi.txt'}", - "b": "{data_dir / 'conll2003.txt'}", - "c": "{data_dir / 'conll2003.txt'}" - }}""" + file_path = { + "a": data_dir / "babi.txt", + "b": data_dir / "conll2003.txt", + "c": data_dir / "conll2003.txt", + } instances = list(reader.read(file_path)) first_three_keys = {instance.fields["dataset"].metadata for instance in instances[:3]} diff --git a/tests/data/dataset_readers/lazy_dataset_reader_test.py b/tests/data/dataset_readers/lazy_dataset_reader_test.py deleted file mode 100644 index 55ded98d6cf..00000000000 --- a/tests/data/dataset_readers/lazy_dataset_reader_test.py +++ /dev/null @@ -1,62 +0,0 @@ -from typing import Iterable, List - -from allennlp.data.fields import TextField -from allennlp.data.instance import Instance -from allennlp.data.dataset_readers import DatasetReader -from allennlp.data.token_indexers import SingleIdTokenIndexer -from allennlp.data.tokenizers import Token -from allennlp.common.testing import AllenNlpTestCase -from allennlp.common.util import ensure_list - - -class LazyDatasetReader(DatasetReader): - def __init__(self, instances: List[Instance], lazy: bool) -> None: - super().__init__() - self.lazy = lazy - self._instances = instances - self.num_reads = 0 - - def _read(self, _: str) -> Iterable[Instance]: - self.num_reads += 1 - return (instance for instance in self._instances) - - -class TestLazyDatasetReader(AllenNlpTestCase): - def setup_method(self): - super().setup_method() - token_indexer = {"tokens": SingleIdTokenIndexer()} - - field1 = TextField([Token(t) for t in ["this", "is", "a", "sentence", "."]], token_indexer) - field2 = TextField( - [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]], token_indexer - ) - field3 = TextField([Token(t) for t in ["here", "is", "a", "sentence", "."]], token_indexer) - field4 = TextField([Token(t) for t in ["this", "is", "short"]], token_indexer) - self.instances = [ - Instance({"text1": field1, "text2": field2}), - Instance({"text1": field3, "text2": field4}), - ] - - def test_lazy(self): - reader = LazyDatasetReader(self.instances, lazy=True) - assert reader.num_reads == 0 - - instances = reader.read("path/to/file") - - for _ in range(10): - _instances = (i for i in instances) - assert ensure_list(_instances) == self.instances - - assert reader.num_reads == 10 - - def test_non_lazy(self): - reader = 
LazyDatasetReader(self.instances, lazy=False) - assert reader.num_reads == 0 - - instances = reader.read("path/to/file") - - for _ in range(10): - _instances = (i for i in instances) - assert ensure_list(_instances) == self.instances - - assert reader.num_reads == 1 diff --git a/tests/data/dataset_readers/sequence_tagging_test.py b/tests/data/dataset_readers/sequence_tagging_test.py index 23ce6234456..1da3fca977b 100644 --- a/tests/data/dataset_readers/sequence_tagging_test.py +++ b/tests/data/dataset_readers/sequence_tagging_test.py @@ -1,16 +1,13 @@ -import pytest - from allennlp.data.dataset_readers import SequenceTaggingDatasetReader -from allennlp.common.util import ensure_list from allennlp.common.testing import AllenNlpTestCase class TestSequenceTaggingDatasetReader: - @pytest.mark.parametrize("lazy", (True, False)) - def test_default_format(self, lazy): - reader = SequenceTaggingDatasetReader(lazy=lazy) - instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv") - instances = ensure_list(instances) + def test_default_format(self): + reader = SequenceTaggingDatasetReader() + instances = list( + reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv") + ) assert len(instances) == 4 fields = instances[0].fields @@ -28,8 +25,7 @@ def test_default_format(self, lazy): def test_brown_corpus_format(self): reader = SequenceTaggingDatasetReader(word_tag_delimiter="/") - instances = reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt") - instances = ensure_list(instances) + instances = list(reader.read(AllenNlpTestCase.FIXTURES_ROOT / "data" / "brown_corpus.txt")) assert len(instances) == 4 fields = instances[0].fields diff --git a/tests/data/dataset_readers/sharded_dataset_reader_test.py b/tests/data/dataset_readers/sharded_dataset_reader_test.py index b0a32b210ad..d1fa329ec28 100644 --- a/tests/data/dataset_readers/sharded_dataset_reader_test.py +++ b/tests/data/dataset_readers/sharded_dataset_reader_test.py @@ -4,13 +4,10 @@ from collections import Counter from typing import Tuple -import pytest - from allennlp.common.testing import AllenNlpTestCase from allennlp.data.dataset_readers import ( SequenceTaggingDatasetReader, ShardedDatasetReader, - DatasetReader, ) from allennlp.data.instance import Instance @@ -25,27 +22,12 @@ def fingerprint(instance: Instance) -> Tuple[str, ...]: return text_tuple + labels_tuple -def test_exception_raised_when_base_reader_implements_sharding(): - class ManuallyShardedBaseReader(DatasetReader): - def __init__(self, **kwargs): - super().__init__(manual_distributed_sharding=True, **kwargs) - - def _read(self, file_path: str): - pass - - def text_to_instance(self, text: str): # type: ignore - pass - - with pytest.raises(ValueError, match="should not implement manual distributed sharding"): - ShardedDatasetReader(ManuallyShardedBaseReader()) - - class TestShardedDatasetReader(AllenNlpTestCase): def setup_method(self) -> None: super().setup_method() # use SequenceTaggingDatasetReader as the base reader - self.base_reader = SequenceTaggingDatasetReader(lazy=True) + self.base_reader = SequenceTaggingDatasetReader() base_file_path = AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv" # Make 100 copies of the data @@ -91,20 +73,3 @@ def test_sharded_read_glob(self): def test_sharded_read_archive(self): self.read_and_check_instances(str(self.archive_filename)) - - def test_attributes_inheritance(self): - # current reader has lazy set to true - base_reader = 
SequenceTaggingDatasetReader(lazy=True) - reader = ShardedDatasetReader(base_reader=base_reader) - - assert ( - reader.lazy - ), "The ShardedDatasetReader didn't inherit the 'lazy' attribute from base_reader" - - def test_set_attributes_main(self): - base_reader = SequenceTaggingDatasetReader(lazy=True) - reader = ShardedDatasetReader(base_reader=base_reader, lazy=False) - - assert ( - not reader.lazy - ), "The ShardedDatasetReader inherited the 'lazy' attribute from base_reader. It should be False" diff --git a/tests/data/dataset_readers/text_classification_json_test.py b/tests/data/dataset_readers/text_classification_json_test.py index 4baf5f7c30b..88d72dc0b4b 100644 --- a/tests/data/dataset_readers/text_classification_json_test.py +++ b/tests/data/dataset_readers/text_classification_json_test.py @@ -2,24 +2,21 @@ from typing import List from allennlp.data.dataset_readers import TextClassificationJsonReader -from allennlp.common.util import ensure_list from allennlp.common.testing import AllenNlpTestCase from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter from allennlp.common.util import get_spacy_model class TestTextClassificationJsonReader: - @pytest.mark.parametrize("lazy", (True, False)) - def test_set_skip_indexing_true(self, lazy): - reader = TextClassificationJsonReader(lazy=lazy, skip_label_indexing=True) + def test_set_skip_indexing_true(self): + reader = TextClassificationJsonReader(skip_label_indexing=True) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "integer_labels.jsonl" ) - instances = reader.read(ag_path) - instances = ensure_list(instances) + instances = list(reader.read(ag_path)) instance1 = {"tokens": ["This", "text", "has", "label", "0"], "label": 0} instance2 = {"tokens": ["This", "text", "has", "label", "1"], "label": 1} @@ -39,20 +36,18 @@ def test_set_skip_indexing_true(self, lazy): / "text_classification_json" / "imdb_corpus.jsonl" ) - ensure_list(reader.read(ag_path)) + list(reader.read(ag_path)) assert str(exec_info.value) == "Labels must be integers if skip_label_indexing is True." 
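The lazy flag disappears from these reader tests because read() now always returns a generator; callers materialize it themselves when they need the whole dataset in memory, roughly as follows (paths and reader taken from the tests in this diff):

from allennlp.common.testing import AllenNlpTestCase
from allennlp.data.dataset_readers import TextClassificationJsonReader

reader = TextClassificationJsonReader(skip_label_indexing=True)
data_path = (
    AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "integer_labels.jsonl"
)
# read() is lazy; wrap it in list() to pull every instance into memory at once.
instances = list(reader.read(data_path))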
- @pytest.mark.parametrize("lazy", (True, False)) - def test_read_from_file_ag_news_corpus(self, lazy): - reader = TextClassificationJsonReader(lazy=lazy) + def test_read_from_file_ag_news_corpus(self): + reader = TextClassificationJsonReader() ag_path = ( AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "ag_news_corpus.jsonl" ) - instances = reader.read(ag_path) - instances = ensure_list(instances) + instances = list(reader.read(ag_path)) instance1 = { "tokens": [ @@ -181,17 +176,15 @@ def test_read_from_file_ag_news_corpus(self, lazy): assert [t.text for t in fields["tokens"].tokens] == instance3["tokens"] assert fields["label"].label == instance3["label"] - @pytest.mark.parametrize("lazy", (True, False)) - def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy): - reader = TextClassificationJsonReader(lazy=lazy, max_sequence_length=5) + def test_read_from_file_ag_news_corpus_and_truncates_properly(self): + reader = TextClassificationJsonReader(max_sequence_length=5) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT / "data" / "text_classification_json" / "ag_news_corpus.jsonl" ) - instances = reader.read(ag_path) - instances = ensure_list(instances) + instances = list(reader.read(ag_path)) instance1 = {"tokens": ["Memphis", "Rout", "Still", "Stings", "for"], "label": "2"} instance2 = {"tokens": ["AP", "-", "Eli", "Manning", "has"], "label": "2"} @@ -209,12 +202,11 @@ def test_read_from_file_ag_news_corpus_and_truncates_properly(self, lazy): assert fields["label"].label == instance3["label"] @pytest.mark.parametrize("max_sequence_length", (None, 5)) - @pytest.mark.parametrize("lazy", (True, False)) def test_read_from_file_ag_news_corpus_and_segments_sentences_properly( - self, lazy, max_sequence_length + self, max_sequence_length ): reader = TextClassificationJsonReader( - lazy=lazy, segment_sentences=True, max_sequence_length=max_sequence_length + segment_sentences=True, max_sequence_length=max_sequence_length ) ag_path = ( AllenNlpTestCase.FIXTURES_ROOT @@ -222,8 +214,7 @@ def test_read_from_file_ag_news_corpus_and_segments_sentences_properly( / "text_classification_json" / "ag_news_corpus.jsonl" ) - instances = reader.read(ag_path) - instances = ensure_list(instances) + instances = list(reader.read(ag_path)) splitter = SpacySentenceSplitter() spacy_tokenizer = get_spacy_model("en_core_web_sm", False, False, False) diff --git a/tests/data/fields/list_field_test.py b/tests/data/fields/list_field_test.py index 2356d9b3646..cdf2ad97d87 100644 --- a/tests/data/fields/list_field_test.py +++ b/tests/data/fields/list_field_test.py @@ -7,8 +7,7 @@ from allennlp.data import Token, Vocabulary, Instance from allennlp.data.fields import TextField, LabelField, ListField, IndexField, SequenceLabelField from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer -from allennlp.data.dataloader import PyTorchDataLoader -from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset +from allennlp.data.data_loaders import SimpleDataLoader from allennlp.data.tokenizers import SpacyTokenizer from allennlp.models import Model from allennlp.modules import Embedding @@ -297,11 +296,10 @@ def test_empty_list_can_be_tensorized(self): instance.as_tensor_dict() def test_batch_with_some_empty_lists_works(self): - dataset = AllennlpDataset([self.empty_instance, self.non_empty_instance], self.vocab) - + instances = [self.empty_instance, self.non_empty_instance] model = DummyModel(self.vocab) model.eval() - loader = PyTorchDataLoader(dataset, 
batch_size=2) + loader = SimpleDataLoader(instances, 2, vocab=self.vocab) batch = next(iter(loader)) model.forward(**batch) @@ -312,11 +310,10 @@ def test_batch_with_some_empty_lists_works(self): # makes a whole lot more sense to just have a minimally-sized tensor that # gets entirely masked and has no effect on the rest of the model. def test_batch_of_entirely_empty_lists_works(self): - dataset = AllennlpDataset([self.empty_instance, self.empty_instance], self.vocab) - + instances = [self.empty_instance, self.empty_instance] model = DummyModel(self.vocab) model.eval() - loader = PyTorchDataLoader(dataset, batch_size=2) + loader = SimpleDataLoader(instances, 2, vocab=self.vocab) batch = next(iter(loader)) model.forward(**batch) diff --git a/tests/data/fields/array_field_test.py b/tests/data/fields/tensor_field_test.py similarity index 77% rename from tests/data/fields/array_field_test.py rename to tests/data/fields/tensor_field_test.py index fbb0eb7da84..54ed9ac3c13 100644 --- a/tests/data/fields/array_field_test.py +++ b/tests/data/fields/tensor_field_test.py @@ -2,14 +2,14 @@ import torch from allennlp.common.testing.test_case import AllenNlpTestCase -from allennlp.data.fields import ArrayField, ListField +from allennlp.data.fields import TensorField, ListField -class TestArrayField(AllenNlpTestCase): +class TestTensorField(AllenNlpTestCase): def test_get_padding_lengths_correctly_returns_ordered_shape(self): shape = [3, 4, 5, 6] array = numpy.zeros(shape) - array_field = ArrayField(array) + array_field = TensorField(array) lengths = array_field.get_padding_lengths() for i in range(len(lengths)): assert lengths["dimension_{}".format(i)] == shape[i] @@ -17,7 +17,7 @@ def test_get_padding_lengths_correctly_returns_ordered_shape(self): def test_as_tensor_handles_larger_padding_dimensions(self): shape = [3, 4] array = numpy.ones(shape) - array_field = ArrayField(array) + array_field = TensorField(array) padded_tensor = ( array_field.as_tensor({"dimension_0": 5, "dimension_1": 6}).detach().cpu().numpy() @@ -26,8 +26,8 @@ def test_as_tensor_handles_larger_padding_dimensions(self): numpy.testing.assert_array_equal(padded_tensor[3:, 4:], 0.0) def test_padding_handles_list_fields(self): - array1 = ArrayField(numpy.ones([2, 3])) - array2 = ArrayField(numpy.ones([1, 5])) + array1 = TensorField(numpy.ones([2, 3])) + array2 = TensorField(numpy.ones([1, 5])) empty_array = array1.empty_field() list_field = ListField([array1, array2, empty_array]) @@ -44,8 +44,8 @@ def test_padding_handles_list_fields(self): numpy.testing.assert_array_equal(returned_tensor, correct_tensor) def test_padding_handles_list_fields_with_padding_values(self): - array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1) - array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1) + array1 = TensorField(numpy.ones([2, 3]), padding_value=-1) + array2 = TensorField(numpy.ones([1, 5]), padding_value=-1) empty_array = array1.empty_field() list_field = ListField([array1, array2, empty_array]) @@ -62,17 +62,17 @@ def test_padding_handles_list_fields_with_padding_values(self): numpy.testing.assert_array_equal(returned_tensor, correct_tensor) def test_printing_doesnt_crash(self): - array = ArrayField(numpy.ones([2, 3]), padding_value=-1) + array = TensorField(numpy.ones([2, 3]), padding_value=-1) print(array) def test_as_tensor_works_with_scalar(self): - array = ArrayField(numpy.asarray(42)) + array = TensorField(numpy.asarray(42)) returned_tensor = array.as_tensor(array.get_padding_lengths()) current_tensor = numpy.asarray(42) 
numpy.testing.assert_array_equal(returned_tensor, current_tensor) def test_as_tensor_with_scalar_keeps_dtype(self): - array = ArrayField(numpy.asarray(42, dtype=numpy.float32)) + array = TensorField(numpy.asarray(42, dtype=numpy.float32)) returned_tensor = array.as_tensor(array.get_padding_lengths()) assert returned_tensor.dtype == torch.float32 @@ -82,13 +82,13 @@ def test_alternative_dtypes(self): # Setting dtype to numpy.int64 should produce a torch.LongTensor when field is converted to # a tensor - array_field1 = ArrayField(array, dtype=numpy.int64) + array_field1 = TensorField(array, dtype=numpy.int64) returned_tensor1 = array_field1.as_tensor(array_field1.get_padding_lengths()) assert returned_tensor1.dtype == torch.int64 # Setting dtype to numpy.uint8 should produce a torch.ByteTensor when field is converted to # a tensor - array_field2 = ArrayField(array, dtype=numpy.uint8) + array_field2 = TensorField(array, dtype=numpy.uint8) returned_tensor2 = array_field2.as_tensor(array_field2.get_padding_lengths()) assert returned_tensor2.dtype == torch.uint8 @@ -99,17 +99,17 @@ def test_alternative_dtypes(self): # Empty fields should have the same dtype empty_field = array_field2.empty_field() - assert empty_field.dtype == array_field2.dtype + assert empty_field.tensor.dtype == array_field2.tensor.dtype def test_len_works_with_scalar(self): - array = ArrayField(numpy.asarray(42)) + array = TensorField(numpy.asarray(42)) assert len(array) == 1 def test_eq(self): - array1 = ArrayField(numpy.asarray([1, 1, 1])) - array2 = ArrayField(numpy.asarray([[1, 1, 1], [1, 1, 1]])) - array3 = ArrayField(numpy.asarray([1, 1, 2])) - array4 = ArrayField(numpy.asarray([1, 1, 1])) + array1 = TensorField(numpy.asarray([1, 1, 1])) + array2 = TensorField(numpy.asarray([[1, 1, 1], [1, 1, 1]])) + array3 = TensorField(numpy.asarray([1, 1, 2])) + array4 = TensorField(numpy.asarray([1, 1, 1])) assert array1 != array2 assert array1 != array3 assert array1 == array4 diff --git a/tests/data/image_loader_test.py b/tests/data/image_loader_test.py new file mode 100644 index 00000000000..096f1770639 --- /dev/null +++ b/tests/data/image_loader_test.py @@ -0,0 +1,80 @@ +import pytest +import torch +import torchvision + +from allennlp.common.testing import AllenNlpTestCase, multi_device +from allennlp.data.image_loader import TorchImageLoader + + +class TorchImageLoaderTest(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + self.image_fixture_path = str( + self.FIXTURES_ROOT / "data" / "images" / "COCO_train2014_000000458752.jpg" + ) + + # Create a few small images of different sizes from the fixture. 
+ image = torchvision.io.read_image(self.image_fixture_path) + assert image.shape == (3, 480, 640) + + image1 = image[:, 0:7, 0:15] + image2 = image[:, 0:9, 0:12] + torchvision.io.write_jpeg(image1, str(self.TEST_DIR / "image1.jpg")) + torchvision.io.write_jpeg(image2, str(self.TEST_DIR / "image2.jpg")) + + @multi_device + @pytest.mark.parametrize( + "loader_params", + [ + {"size_divisibility": 0, "pad_value": 0.0}, + {"size_divisibility": 1, "pad_value": 0.0}, + {"size_divisibility": 4, "pad_value": 0.0}, + ], + ids=str, + ) + def test_basic_load(self, device, loader_params): + loader = TorchImageLoader(resize=False, normalize=False, device=device, **loader_params) + torch_device = torch.device(device) + images, sizes = loader([self.TEST_DIR / "image1.jpg", self.TEST_DIR / "image2.jpg"]) + assert images.device == torch_device + assert sizes.device == torch_device + assert images.shape[0] == 2 + assert images.shape[1] == 3 + assert sizes.shape == (2, 2) + assert list(sizes[0]) == [7, 15] + assert list(sizes[1]) == [9, 12] + if loader.size_divisibility <= 1: + assert images.shape[2] == 9 + assert images.shape[3] == 15 + else: + assert images.shape[2] >= 9 + assert images.shape[3] >= 15 + assert (images.shape[2] / loader.size_divisibility) % 1 == 0 + + image, size = loader(self.TEST_DIR / "image1.jpg") + assert image.device == torch_device + assert size.device == torch_device + assert len(image.shape) == 3 + assert list(size) == [7, 15] + + @multi_device + def test_resize_and_normalize(self, device): + loader = TorchImageLoader(resize=True, normalize=True, device=device) + torch_device = torch.device(device) + image, size = loader(self.image_fixture_path) + assert image.device == torch_device + assert size.device == torch_device + assert image.shape[1] == 800 + + def test_resize_and_normalize_matches_generalized_rcnn_transform(self): + loader = TorchImageLoader(resize=True, normalize=True, size_divisibility=32) + transform = torchvision.models.detection.transform.GeneralizedRCNNTransform( + loader.min_size, loader.max_size, loader.pixel_mean, loader.pixel_std + ) + + loaded_image, _ = loader([self.image_fixture_path]) + + raw_image, _ = TorchImageLoader(resize=False, normalize=False)(self.image_fixture_path) + transformed_raw_image, _ = transform([raw_image]) + + assert loaded_image.shape == transformed_raw_image.tensors.shape diff --git a/tests/data/samplers/bucket_batch_sampler_test.py b/tests/data/samplers/bucket_batch_sampler_test.py index dc71aa2efaa..3a972facdc2 100644 --- a/tests/data/samplers/bucket_batch_sampler_test.py +++ b/tests/data/samplers/bucket_batch_sampler_test.py @@ -1,21 +1,18 @@ from allennlp.common import Params -from allennlp.data import Instance, Token -from allennlp.data.batch import Batch +from allennlp.data import Instance, Token, Batch from allennlp.data.fields import TextField from allennlp.data.samplers import BucketBatchSampler -from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset -from allennlp.data.dataloader import PyTorchDataLoader +from allennlp.data.data_loaders import MultiProcessDataLoader from .sampler_test import SamplerTest class TestBucketSampler(SamplerTest): def test_create_batches_groups_correctly(self): - dataset = AllennlpDataset(self.instances, vocab=self.vocab) - sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"]) + sampler = BucketBatchSampler(batch_size=2, padding_noise=0, sorting_keys=["text"]) grouped_instances = [] - for indices in sampler: + for indices in 
sampler.get_batch_indices(self.instances): grouped_instances.append([self.instances[idx] for idx in indices]) expected_groups = [ [self.instances[4], self.instances[2]], @@ -28,8 +25,7 @@ def test_create_batches_groups_correctly(self): assert expected_groups == [] def test_guess_sorting_key_picks_the_longest_key(self): - dataset = AllennlpDataset(self.instances, vocab=self.vocab) - sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0) + sampler = BucketBatchSampler(batch_size=2, padding_noise=0) instances = [] short_tokens = [Token(t) for t in ["what", "is", "this", "?"]] long_tokens = [Token(t) for t in ["this", "is", "a", "not", "very", "long", "passage"]] @@ -62,13 +58,12 @@ def test_guess_sorting_key_picks_the_longest_key(self): assert sampler.sorting_keys == ["passage"] def test_from_params(self): - dataset = AllennlpDataset(self.instances, self.vocab) params = Params({}) sorting_keys = ["s1", "s2"] params["sorting_keys"] = sorting_keys params["batch_size"] = 32 - sampler = BucketBatchSampler.from_params(params=params, data_source=dataset) + sampler = BucketBatchSampler.from_params(params=params) assert sampler.sorting_keys == sorting_keys assert sampler.padding_noise == 0.1 @@ -83,27 +78,33 @@ def test_from_params(self): } ) - sampler = BucketBatchSampler.from_params(params=params, data_source=dataset) + sampler = BucketBatchSampler.from_params(params=params) assert sampler.sorting_keys == sorting_keys assert sampler.padding_noise == 0.5 assert sampler.batch_size == 100 assert sampler.drop_last def test_drop_last_works(self): - dataset = AllennlpDataset(self.instances, vocab=self.vocab) sampler = BucketBatchSampler( - dataset, batch_size=2, padding_noise=0, sorting_keys=["text"], drop_last=True, ) + # We use a custom collate_fn for testing, which doesn't actually create tensors, # just the allennlp Batches. - dataloader = PyTorchDataLoader( - dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) + def collate_fn(x, **kwargs): + return Batch(x) + + data_loader = MultiProcessDataLoader( + self.get_mock_reader(), + "fake_path", + batch_sampler=sampler, ) - batches = [batch for batch in iter(dataloader)] + data_loader.collate_fn = collate_fn + data_loader.index_with(self.vocab) + batches = [batch for batch in iter(data_loader)] stats = self.get_batches_stats(batches) # all batches have length batch_size @@ -113,29 +114,21 @@ def test_drop_last_works(self): assert stats["total_instances"] == len(self.instances) - 1 def test_batch_count(self): - dataset = AllennlpDataset(self.instances, vocab=self.vocab) - sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"]) - # We use a custom collate_fn for testing, which doesn't actually create tensors, - # just the allennlp Batches. - dataloader = PyTorchDataLoader( - dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) + sampler = BucketBatchSampler(batch_size=2, padding_noise=0, sorting_keys=["text"]) + data_loader = MultiProcessDataLoader( + self.get_mock_reader(), "fake_path", batch_sampler=sampler ) - - assert len(dataloader) == 3 + data_loader.index_with(self.vocab) + assert len(data_loader) == 3 def test_batch_count_with_drop_last(self): - dataset = AllennlpDataset(self.instances, vocab=self.vocab) sampler = BucketBatchSampler( - dataset, batch_size=2, padding_noise=0, sorting_keys=["text"], drop_last=True, ) - # We use a custom collate_fn for testing, which doesn't actually create tensors, - # just the allennlp Batches. 
- dataloader = PyTorchDataLoader( - dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) + data_loader = MultiProcessDataLoader( + self.get_mock_reader(), "fake_path", batch_sampler=sampler ) - - assert len(dataloader) == 2 + assert len(data_loader) == 2 diff --git a/tests/data/samplers/max_tokens_batch_sampler_test.py b/tests/data/samplers/max_tokens_batch_sampler_test.py index 04e5c87ca6c..a3b7e094733 100644 --- a/tests/data/samplers/max_tokens_batch_sampler_test.py +++ b/tests/data/samplers/max_tokens_batch_sampler_test.py @@ -1,23 +1,17 @@ -from allennlp.common import Params from allennlp.data import Instance, Token -from allennlp.data.batch import Batch from allennlp.data.fields import TextField from allennlp.data.samplers import MaxTokensBatchSampler -from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset -from allennlp.data.dataloader import PyTorchDataLoader +from allennlp.data.data_loaders import MultiProcessDataLoader from .sampler_test import SamplerTest class TestMaxTokensSampler(SamplerTest): def test_create_batches_groups_correctly(self): - dataset = AllennlpDataset(self.instances, vocab=self.vocab) - sampler = MaxTokensBatchSampler( - dataset, max_tokens=8, padding_noise=0, sorting_keys=["text"] - ) + sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0, sorting_keys=["text"]) grouped_instances = [] - for indices in sampler: + for indices in sampler.get_batch_indices(self.instances): grouped_instances.append([self.instances[idx] for idx in indices]) expected_groups = [ [self.instances[4], self.instances[2]], @@ -30,8 +24,7 @@ def test_create_batches_groups_correctly(self): assert expected_groups == [] def test_guess_sorting_key_picks_the_longest_key(self): - dataset = AllennlpDataset(self.instances, vocab=self.vocab) - sampler = MaxTokensBatchSampler(dataset, max_tokens=8, padding_noise=0) + sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0) instances = [] short_tokens = [Token(t) for t in ["what", "is", "this", "?"]] long_tokens = [Token(t) for t in ["this", "is", "a", "not", "very", "long", "passage"]] @@ -63,35 +56,9 @@ def test_guess_sorting_key_picks_the_longest_key(self): sampler._guess_sorting_keys(instances) assert sampler.sorting_keys == ["passage"] - def test_from_params(self): - dataset = AllennlpDataset(self.instances, self.vocab) - params = Params({}) - - sorting_keys = ["s1", "s2"] - params["sorting_keys"] = sorting_keys - params["max_tokens"] = 32 - sampler = MaxTokensBatchSampler.from_params(params=params, data_source=dataset) - - assert sampler.sorting_keys == sorting_keys - assert sampler.padding_noise == 0.1 - assert sampler.max_tokens == 32 - - params = Params({"sorting_keys": sorting_keys, "padding_noise": 0.5, "max_tokens": 100}) - - sampler = MaxTokensBatchSampler.from_params(params=params, data_source=dataset) - assert sampler.sorting_keys == sorting_keys - assert sampler.padding_noise == 0.5 - assert sampler.max_tokens == 100 - def test_batch_count(self): - dataset = AllennlpDataset(self.instances, vocab=self.vocab) - sampler = MaxTokensBatchSampler( - dataset, max_tokens=8, padding_noise=0, sorting_keys=["text"] + sampler = MaxTokensBatchSampler(max_tokens=8, padding_noise=0, sorting_keys=["text"]) + data_loader = MultiProcessDataLoader( + self.get_mock_reader(), "fake_path", batch_sampler=sampler ) - # We use a custom collate_fn for testing, which doesn't actually create tensors, - # just the allennlp Batches. 
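These sampler tests also reflect that batch samplers no longer wrap a dataset: a sampler is built standalone and handed to the data loader, which calls get_batch_indices() on the instances it reads. A rough sketch using the same fixture reader seen elsewhere in this diff; the sorting key is whatever text field the reader produces, assumed here to be "tokens":

from allennlp.common.testing import AllenNlpTestCase
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.dataset_readers import SequenceTaggingDatasetReader
from allennlp.data.samplers import BucketBatchSampler

sampler = BucketBatchSampler(batch_size=2, padding_noise=0, sorting_keys=["tokens"])
loader = MultiProcessDataLoader(
    SequenceTaggingDatasetReader(),
    str(AllenNlpTestCase.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
    batch_sampler=sampler,  # no batch_size here; the sampler decides the batches
)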
- dataloader = PyTorchDataLoader( - dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x) - ) - - assert len(dataloader) == 3 + assert len(data_loader) == 3 diff --git a/tests/data/samplers/sampler_test.py b/tests/data/samplers/sampler_test.py index e981d41ebec..3be895f8657 100644 --- a/tests/data/samplers/sampler_test.py +++ b/tests/data/samplers/sampler_test.py @@ -1,7 +1,7 @@ from typing import List, Iterable, Dict, Union from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Vocabulary, Instance, Token, Batch +from allennlp.data import Vocabulary, Instance, Token, Batch, DatasetReader from allennlp.data.fields import TextField from allennlp.data.token_indexers import SingleIdTokenIndexer @@ -40,9 +40,22 @@ def setup_method(self): self.instances = instances self.lazy_instances = LazyIterable(instances) + def get_mock_reader(self) -> DatasetReader: + class MockReader(DatasetReader): + def __init__(self, instances, **kwargs): + super().__init__(**kwargs) + self.instances = instances + + def _read(self, file_path: str): + for instance in self.instances: + yield instance + + return MockReader(self.instances) + def create_instance(self, str_tokens: List[str]): tokens = [Token(t) for t in str_tokens] instance = Instance({"text": TextField(tokens, self.token_indexers)}) + instance.index_fields(self.vocab) return instance def create_instances_from_token_counts(self, token_counts: List[int]) -> List[Instance]: diff --git a/tests/models/multitask_test.py b/tests/models/multitask_test.py new file mode 100644 index 00000000000..43ffc50a33a --- /dev/null +++ b/tests/models/multitask_test.py @@ -0,0 +1,103 @@ +import pytest + +from allennlp.common.testing import ModelTestCase +from allennlp.data import Instance, Vocabulary, Batch +from allennlp.data.fields import LabelField, TextField, MetadataField +from allennlp.data.token_indexers import PretrainedTransformerIndexer +from allennlp.data.tokenizers import PretrainedTransformerTokenizer +from allennlp.models.heads import ClassifierHead +from allennlp.models import MultiTaskModel +from allennlp.modules.backbones import PretrainedTransformerBackbone +from allennlp.modules.seq2vec_encoders import ClsPooler + + +class TestMultiTaskModel(ModelTestCase): + def test_forward_works(self): + # Setting up the model. + transformer_name = "epwalsh/bert-xsmall-dummy" + vocab = Vocabulary() + backbone = PretrainedTransformerBackbone(vocab, transformer_name) + head1 = ClassifierHead(vocab, seq2vec_encoder=ClsPooler(20), input_dim=20, num_labels=3) + head2 = ClassifierHead(vocab, seq2vec_encoder=ClsPooler(20), input_dim=20, num_labels=4) + # We'll start with one head, and add another later. + model = MultiTaskModel(vocab, backbone, {"cls": head1}) + + # Setting up the data. + tokenizer = PretrainedTransformerTokenizer(model_name=transformer_name) + token_indexers = PretrainedTransformerIndexer(model_name=transformer_name) + tokens = tokenizer.tokenize("This is a test") + text_field = TextField(tokens, {"tokens": token_indexers}) + label_field1 = LabelField(1, skip_indexing=True) + label_field2 = LabelField(3, skip_indexing=True) + instance = Instance( + {"text": text_field, "label": label_field1, "task": MetadataField("cls")} + ) + + # Now we run some tests. First, the default. + outputs = model.forward_on_instance(instance) + assert "encoded_text" in outputs + assert "cls_logits" in outputs + assert "loss" in outputs + assert "cls_loss" in outputs + + # When we don't have labels. 
+ instance = Instance({"text": text_field, "task": MetadataField("cls")}) + outputs = model.forward_on_instance(instance) + assert "encoded_text" in outputs + assert "cls_logits" in outputs + assert "loss" not in outputs + + # Same in eval mode + model.eval() + outputs = model.forward_on_instance(instance) + assert "encoded_text" in outputs + assert "loss" not in outputs # no loss because we have no labels + assert "cls_logits" in outputs # but we can compute logits + model.train() + + # Now for two headed and other more complex tests. + model = MultiTaskModel( + vocab, + backbone, + {"cls1": head1, "cls2": head2}, + arg_name_mapping={ + "backbone": {"question": "text"}, + }, + ) + + # Basic case where things should work, with two heads that both need label inputs. + instance1 = Instance( + {"text": text_field, "label": label_field1, "task": MetadataField("cls1")} + ) + instance2 = Instance( + {"text": text_field, "label": label_field2, "task": MetadataField("cls2")} + ) + batch = Batch([instance1, instance2]) + outputs = model.forward(**batch.as_tensor_dict()) + assert "encoded_text" in outputs + assert "cls1_logits" in outputs + assert "cls1_loss" in outputs + assert "cls2_logits" in outputs + assert "cls2_loss" in outputs + assert "loss" in outputs + combined_loss = outputs["cls1_loss"].item() + outputs["cls2_loss"].item() + assert abs(outputs["loss"].item() - combined_loss) <= 1e-6 + + # This should fail, because we're using task 'cls1' with the labels for `cls2`, and the sizes don't match. + # This shows up as an IndexError in this case. It'd be nice to catch this kind of error more cleanly in the + # model class, but I'm not sure how. + instance = Instance( + {"text": text_field, "label": label_field2, "task": MetadataField("cls1")} + ) + with pytest.raises(IndexError): + outputs = model.forward_on_instance(instance) + + # This one should fail because we now have two things that map to "text" in the backbone, + # and they would clobber each other. The name mapping that we have in the model is ok, as + # long as our data loader is set up such that we don't batch instances that have both of + # these fields at the same time. + instance = Instance( + {"question": text_field, "text": text_field, "task": MetadataField("cls1")} + ) + with pytest.raises(ValueError, match="duplicate argument text"): + outputs = model.forward_on_instance(instance) diff --git a/tests/models/simple_tagger_test.py b/tests/models/simple_tagger_test.py index f91e26bd291..2cfab456f37 100644 --- a/tests/models/simple_tagger_test.py +++ b/tests/models/simple_tagger_test.py @@ -7,7 +7,7 @@ from allennlp.common.checks import ConfigurationError from allennlp.common.params import Params from allennlp.data.dataset_readers import DatasetReader -from allennlp.data import DataLoader, PyTorchDataLoader +from allennlp.data.data_loaders import DataLoader, SimpleDataLoader from allennlp.models import Model from allennlp.training import GradientDescentTrainer, Trainer @@ -55,7 +55,7 @@ def test_regularization(self): penalty = self.model.get_regularization_penalty() assert penalty is None - data_loader = PyTorchDataLoader(self.instances, batch_size=32) + data_loader = SimpleDataLoader(self.instances, batch_size=32) trainer = GradientDescentTrainer(self.model, None, data_loader) # optimizer, # You get a RuntimeError if you call `model.forward` twice on the same inputs. 
@@ -97,8 +97,11 @@ def setup_method(self): params = Params.from_file(param_file) self.reader = DatasetReader.from_params(params["dataset_reader"]) self.data_loader = DataLoader.from_params( - dataset=self.instances, params=params["data_loader"] + reader=self.reader, + data_path=str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"), + params=params["data_loader"], ) + self.data_loader.index_with(self.vocab) self.trainer = Trainer.from_params( model=self.model, data_loader=self.data_loader, diff --git a/tests/modules/attention/scaled_dot_product_attention_test.py b/tests/modules/attention/scaled_dot_product_attention_test.py new file mode 100644 index 00000000000..247cafc200d --- /dev/null +++ b/tests/modules/attention/scaled_dot_product_attention_test.py @@ -0,0 +1,30 @@ +import torch +from numpy.testing import assert_almost_equal +import numpy + +from allennlp.common import Params +from allennlp.common.testing.test_case import AllenNlpTestCase +from allennlp.modules.attention.attention import Attention +from allennlp.modules.attention.scaled_dot_product_attention import ScaledDotProductAttention + + +class TestScaledDotProductAttention(AllenNlpTestCase): + def test_can_init_scaled_dot(self): + legacy_attention = Attention.from_params( + Params({"type": "scaled_dot_product", "scaling_factor": 9}) + ) + isinstance(legacy_attention, ScaledDotProductAttention) + + def test_scaled_dot_product_similarity(self): + attn = ScaledDotProductAttention(9, normalize=False) + vector = torch.FloatTensor([[0, 0, 0], [1, 1, 1]]) + matrix = torch.FloatTensor([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]).transpose( + -1, -2 + ) + output = attn(vector, matrix) + + assert_almost_equal( + output.numpy(), + numpy.array([[[0.0, 0.0], [2.0, 5.0]], [[0.0, 0.0], [8.0, 11.0]]]), + decimal=2, + ) diff --git a/tests/modules/elmo_test.py b/tests/modules/elmo_test.py index 77dfa797fa2..f885f6f39ec 100644 --- a/tests/modules/elmo_test.py +++ b/tests/modules/elmo_test.py @@ -12,8 +12,7 @@ from allennlp.data.fields import TextField from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer -from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset -from allennlp.data.dataloader import PyTorchDataLoader +from allennlp.data.data_loaders import SimpleDataLoader from allennlp.modules.elmo import _ElmoBiLm, _ElmoCharacterEncoder, Elmo from allennlp.modules.token_embedders import ElmoTokenEmbedder from allennlp.nn.util import remove_sentence_boundaries @@ -100,9 +99,9 @@ def test_elmo_bilm(self): instances.append(instance) vocab = Vocabulary() - dataset = AllennlpDataset(instances, vocab) # Now finally we can iterate through batches. 
- loader = PyTorchDataLoader(dataset, 3) + loader = SimpleDataLoader(instances, 3) + loader.index_with(vocab) for i, batch in enumerate(loader): lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"]) top_layer_embeddings, mask = remove_sentence_boundaries( diff --git a/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py b/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py index 6a6eabe48e6..4dcb3db98d8 100644 --- a/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py +++ b/tests/modules/seq2seq_encoders/pytorch_transformer_wrapper_test.py @@ -31,39 +31,6 @@ def test_positional_embeddings(positional_encoding: Optional[str]): assert torch.isfinite(outputs).all() -@pytest.mark.parametrize("positional_encoding", [None, "sinusoidal", "embedding"]) -def test_mask_works(positional_encoding: Optional[str]): - # All sizes are prime, making them easy to find during debugging. - batch_size = 3 - max_seq_len = 11 - n_head = 2 - dims = 7 * n_head - transformer = PytorchTransformer( - dims, 2, positional_encoding=positional_encoding, num_attention_heads=n_head - ) - transformer.eval() - - with torch.no_grad(): - # Construct inputs and masks - inputs = torch.randn(batch_size, max_seq_len, dims) - all_ones_mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) - mask = all_ones_mask.clone() - for b in range(batch_size): - mask[b, max_seq_len - b :] = False - altered_inputs = inputs + (~mask).unsqueeze(2) * 10.0 - - # Make sure there is a difference without the mask - assert not torch.allclose( - transformer(inputs, all_ones_mask), transformer(altered_inputs, all_ones_mask) - ) - - # Make sure there is no difference with the mask - assert torch.allclose( - torch.masked_select(transformer(inputs, mask), mask.unsqueeze(2)), - torch.masked_select(transformer(altered_inputs, mask), mask.unsqueeze(2)), - ) - - @pytest.mark.parametrize("positional_encoding", [None, "sinusoidal", "embedding"]) def test_positional_encodings(positional_encoding: Optional[str]): # All sizes are prime, making them easy to find during debugging. @@ -96,9 +63,42 @@ def test_positional_encodings(positional_encoding: Optional[str]): if positional_encoding is None: assert torch.allclose( - torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-7 + torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-5 ) else: assert not torch.allclose( - torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-7 + torch.gather(unshuffled_output, 1, shuffle), shuffled_output, atol=2e-5 ) + + +@pytest.mark.parametrize("positional_encoding", [None, "sinusoidal", "embedding"]) +def test_mask_works(positional_encoding: Optional[str]): + # All sizes are prime, making them easy to find during debugging. 
+ batch_size = 3 + max_seq_len = 11 + n_head = 2 + dims = 7 * n_head + transformer = PytorchTransformer( + dims, 2, positional_encoding=positional_encoding, num_attention_heads=n_head + ) + transformer.eval() + + with torch.no_grad(): + # Construct inputs and masks + inputs = torch.randn(batch_size, max_seq_len, dims) + all_ones_mask = torch.ones(batch_size, max_seq_len, dtype=torch.bool) + mask = all_ones_mask.clone() + for b in range(batch_size): + mask[b, max_seq_len - b :] = False + altered_inputs = inputs + (~mask).unsqueeze(2) * 10.0 + + # Make sure there is a difference without the mask + assert not torch.allclose( + transformer(inputs, all_ones_mask), transformer(altered_inputs, all_ones_mask) + ) + + # Make sure there is no difference with the mask + assert torch.allclose( + torch.masked_select(transformer(inputs, mask), mask.unsqueeze(2)), + torch.masked_select(transformer(altered_inputs, mask), mask.unsqueeze(2)), + ) diff --git a/tests/modules/transformer/activation_layer_test.py b/tests/modules/transformer/activation_layer_test.py new file mode 100644 index 00000000000..8c1b7ebef26 --- /dev/null +++ b/tests/modules/transformer/activation_layer_test.py @@ -0,0 +1,32 @@ +import copy +import torch + +from allennlp.common import Params +from allennlp.modules.transformer import ActivationLayer +from allennlp.common.testing import AllenNlpTestCase + + +class TestActivationLayer(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = { + "hidden_size": 5, + "intermediate_size": 3, + "activation": "relu", + } + + params = Params(copy.deepcopy(self.params_dict)) + + self.activation_layer = ActivationLayer.from_params(params) + + def test_can_construct_from_params(self): + + activation_layer = self.activation_layer + + assert activation_layer.dense.in_features == self.params_dict["hidden_size"] + assert activation_layer.dense.out_features == self.params_dict["intermediate_size"] + + def test_forward_runs(self): + + self.activation_layer.forward(torch.randn(7, 5)) diff --git a/tests/modules/transformer/bimodal_attention_test.py b/tests/modules/transformer/bimodal_attention_test.py new file mode 100644 index 00000000000..40dc81f12de --- /dev/null +++ b/tests/modules/transformer/bimodal_attention_test.py @@ -0,0 +1,55 @@ +import copy +import torch + +from allennlp.common import Params +from allennlp.modules.transformer import BiModalAttention +from allennlp.common.testing import AllenNlpTestCase + + +class TestBiModalAttention(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = { + "hidden_size1": 6, + "hidden_size2": 4, + "combined_hidden_size": 16, + "num_attention_heads": 2, + "dropout1": 0.1, + "dropout2": 0.2, + } + + params = Params(copy.deepcopy(self.params_dict)) + + self.biattention = BiModalAttention.from_params(params) + + def test_can_construct_from_params(self): + + biattention = self.biattention + + assert biattention.num_attention_heads == self.params_dict["num_attention_heads"] + assert biattention.attention_head_size == int( + self.params_dict["combined_hidden_size"] / self.params_dict["num_attention_heads"] + ) + assert ( + biattention.all_head_size + == self.params_dict["num_attention_heads"] * biattention.attention_head_size + ) + assert biattention.query1.in_features == self.params_dict["hidden_size1"] + assert biattention.key1.in_features == self.params_dict["hidden_size1"] + assert biattention.value1.in_features == self.params_dict["hidden_size1"] + assert biattention.dropout1.p == 
self.params_dict["dropout1"] + + assert biattention.query2.in_features == self.params_dict["hidden_size2"] + assert biattention.key2.in_features == self.params_dict["hidden_size2"] + assert biattention.value2.in_features == self.params_dict["hidden_size2"] + assert biattention.dropout2.p == self.params_dict["dropout2"] + + def test_forward_runs(self): + + self.biattention.forward( + torch.randn(2, 3, 6), + torch.randn(2, 3, 4), + torch.randint(0, 2, (2, 2, 3, 3)) == 1, # creating boolean tensors + torch.randint(0, 2, (2, 2, 3, 3)) == 1, + ) diff --git a/tests/modules/transformer/bimodal_encoder_test.py b/tests/modules/transformer/bimodal_encoder_test.py new file mode 100644 index 00000000000..b95af3bfa1f --- /dev/null +++ b/tests/modules/transformer/bimodal_encoder_test.py @@ -0,0 +1,95 @@ +import copy +import torch +from allennlp.common import Params +from allennlp.common import cached_transformers +from allennlp.common.testing import assert_equal_parameters +from allennlp.modules.transformer import BiModalEncoder +from allennlp.common.testing import AllenNlpTestCase + + +class TestBiModalEncoder(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = { + "num_hidden_layers1": 3, + "num_hidden_layers2": 3, + "hidden_size1": 12, + "hidden_size2": 12, + "combined_hidden_size": 12, + "intermediate_size1": 3, + "intermediate_size2": 3, + "num_attention_heads1": 4, + "num_attention_heads2": 6, + "combined_num_attention_heads": 2, + "attention_dropout1": 0.1, + "hidden_dropout1": 0.2, + "attention_dropout2": 0.1, + "hidden_dropout2": 0.2, + "activation": "relu", + "biattention_id1": [1, 2], + "biattention_id2": [1, 2], + "fixed_layer1": 1, + "fixed_layer2": 1, + } + + params = Params(copy.deepcopy(self.params_dict)) + + self.bimodal_encoder = BiModalEncoder.from_params(params) + + self.pretrained = cached_transformers.get("bert-base-uncased", False) + + def test_can_construct_from_params(self): + + modules = dict(self.bimodal_encoder.named_modules()) + assert len(modules["layers1"]) == self.params_dict["num_hidden_layers1"] + assert len(modules["layers2"]) == self.params_dict["num_hidden_layers2"] + + def test_forward_runs(self): + + embedding1 = torch.randn(16, 34, self.params_dict["hidden_size1"]) + embedding2 = torch.randn(16, 2, self.params_dict["hidden_size2"]) + attn_mask1 = torch.randint(0, 2, (16, 1, 1, 34)) == 1 + attn_mask2 = torch.randint(0, 2, (16, 1, 1, 2)) == 1 + + self.bimodal_encoder.forward(embedding1, embedding2, attn_mask1, attn_mask2) + + def test_loading_from_pretrained_weights(self): + pretrained_module = self.pretrained.encoder + required_kwargs = [ + "num_hidden_layers2", + "hidden_size2", + "combined_hidden_size", + "intermediate_size2", + "num_attention_heads2", + "combined_num_attention_heads", + "attention_dropout2", + "hidden_dropout2", + "biattention_id1", + "biattention_id2", + "fixed_layer1", + "fixed_layer2", + ] + kwargs = {key: self.params_dict[key] for key in required_kwargs} + module = BiModalEncoder.from_pretrained_module(pretrained_module, **kwargs) + mapping = { + val: key + for key, val in module._construct_default_mapping( + pretrained_module, "huggingface", {} + ).items() + } + assert_equal_parameters( + pretrained_module, + module, + ignore_missing=True, + mapping=mapping, + ) + + def test_default_parameters(self): + encoder = BiModalEncoder() + embedding1 = torch.randn(16, 34, 1024) + embedding2 = torch.randn(16, 2, 1024) + attn_mask1 = torch.randint(0, 2, (16, 1, 1, 34)) == 1 + attn_mask2 = torch.randint(0, 2, 
(16, 1, 1, 2)) == 1 + + encoder.forward(embedding1, embedding2, attn_mask1, attn_mask2) diff --git a/tests/modules/transformer/output_layer_test.py b/tests/modules/transformer/output_layer_test.py new file mode 100644 index 00000000000..3e1c716c8ee --- /dev/null +++ b/tests/modules/transformer/output_layer_test.py @@ -0,0 +1,36 @@ +import copy +import torch + +from allennlp.common import Params +from allennlp.modules.transformer import OutputLayer +from allennlp.common.testing import AllenNlpTestCase + + +class TestOutputLayer(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = { + "input_size": 3, + "hidden_size": 5, + "dropout": 0.1, + } + + params = Params(copy.deepcopy(self.params_dict)) + + self.output_layer = OutputLayer.from_params(params) + + def test_can_construct_from_params(self): + + output_layer = self.output_layer + + assert output_layer.dense.in_features == self.params_dict["input_size"] + assert output_layer.dense.out_features == self.params_dict["hidden_size"] + + assert output_layer.layer_norm.normalized_shape[0] == self.params_dict["hidden_size"] + + assert output_layer.dropout.p == self.params_dict["dropout"] + + def test_forward_runs(self): + + self.output_layer.forward(torch.randn(3, 3), torch.randn(3, 5)) diff --git a/tests/modules/transformer/positional_encoding_test.py b/tests/modules/transformer/positional_encoding_test.py new file mode 100644 index 00000000000..0a29bd6e49e --- /dev/null +++ b/tests/modules/transformer/positional_encoding_test.py @@ -0,0 +1,76 @@ +import copy +import torch +import numpy as np +from allennlp.common import Params +from allennlp.modules.transformer import SinusoidalPositionalEncoding +from allennlp.common.testing import AllenNlpTestCase + + +class TestSinusoidalPositionalEncoding(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = { + "min_timescale": 1.0, + "max_timescale": 1.0e4, + } + + params = Params(copy.deepcopy(self.params_dict)) + + self.positional_encoding = SinusoidalPositionalEncoding.from_params(params) + + def test_can_construct_from_params(self): + assert self.positional_encoding.min_timescale == self.params_dict["min_timescale"] + assert self.positional_encoding.max_timescale == self.params_dict["max_timescale"] + + def test_forward(self): + tensor2tensor_result = np.asarray( + [ + [0.00000000e00, 0.00000000e00, 1.00000000e00, 1.00000000e00], + [8.41470957e-01, 9.99999902e-05, 5.40302277e-01, 1.00000000e00], + [9.09297407e-01, 1.99999980e-04, -4.16146845e-01, 1.00000000e00], + ] + ) + + tensor = torch.zeros([2, 3, 4]) + result = self.positional_encoding(tensor) + np.testing.assert_almost_equal(result[0].detach().cpu().numpy(), tensor2tensor_result) + np.testing.assert_almost_equal(result[1].detach().cpu().numpy(), tensor2tensor_result) + + # Check case with odd number of dimensions. 
+ tensor2tensor_result = np.asarray( + [ + [ + 0.00000000e00, + 0.00000000e00, + 0.00000000e00, + 1.00000000e00, + 1.00000000e00, + 1.00000000e00, + 0.00000000e00, + ], + [ + 8.41470957e-01, + 9.99983307e-03, + 9.99999902e-05, + 5.40302277e-01, + 9.99949992e-01, + 1.00000000e00, + 0.00000000e00, + ], + [ + 9.09297407e-01, + 1.99986659e-02, + 1.99999980e-04, + -4.16146815e-01, + 9.99800026e-01, + 1.00000000e00, + 0.00000000e00, + ], + ] + ) + + tensor = torch.zeros([2, 3, 7]) + result = self.positional_encoding(tensor) + np.testing.assert_almost_equal(result[0].detach().cpu().numpy(), tensor2tensor_result) + np.testing.assert_almost_equal(result[1].detach().cpu().numpy(), tensor2tensor_result) diff --git a/tests/modules/transformer/self_attention_test.py b/tests/modules/transformer/self_attention_test.py new file mode 100644 index 00000000000..a1205b00bbd --- /dev/null +++ b/tests/modules/transformer/self_attention_test.py @@ -0,0 +1,167 @@ +import copy +import torch +import pytest + +from allennlp.common import Params +from allennlp.common import cached_transformers +from allennlp.common.testing import assert_equal_parameters + +from allennlp.modules.transformer import SelfAttention +from allennlp.common.testing import AllenNlpTestCase + +from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.bert.modeling_bert import BertSelfAttention +from transformers.models.roberta.configuration_roberta import RobertaConfig +from transformers.models.roberta.modeling_roberta import RobertaSelfAttention +from transformers.models.electra.configuration_electra import ElectraConfig +from transformers.models.electra.modeling_electra import ElectraSelfAttention +from transformers.models.distilbert.configuration_distilbert import DistilBertConfig +from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention + +PARAMS_DICT = { + "hidden_size": 6, + "num_attention_heads": 2, + "dropout": 0.0, +} + + +def get_modules(params_dict): + modules = {} + params = copy.deepcopy(params_dict) + params["attention_probs_dropout_prob"] = params.pop("dropout") + + # bert, roberta, electra self attentions have the same code. 
+ + torch.manual_seed(1234) + hf_module = BertSelfAttention(BertConfig(**params)) + modules["bert"] = hf_module + + torch.manual_seed(1234) + hf_module = RobertaSelfAttention(RobertaConfig(**params)) + modules["roberta"] = hf_module + + torch.manual_seed(1234) + hf_module = ElectraSelfAttention(ElectraConfig(**params)) + modules["electra"] = hf_module + + torch.manual_seed(1234) + distilparams = copy.deepcopy(params_dict) + distilparams["n_heads"] = distilparams.pop("num_attention_heads") + distilparams["dim"] = distilparams.pop("hidden_size") + distilparams["attention_dropout"] = distilparams.pop("dropout") + hf_module = MultiHeadSelfAttention(DistilBertConfig(**distilparams)) + modules["distilbert"] = hf_module + + return modules + + +class TestSelfAttention(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = {key: val for key, val in PARAMS_DICT.items()} + + params = Params(copy.deepcopy(self.params_dict)) + + self.self_attention = SelfAttention.from_params(params) + + def test_can_construct_from_params(self): + assert self.self_attention.num_attention_heads == self.params_dict["num_attention_heads"] + assert self.self_attention.attention_head_size == int( + self.params_dict["hidden_size"] / self.params_dict["num_attention_heads"] + ) + + assert ( + self.self_attention.all_head_size + == self.params_dict["num_attention_heads"] * self.self_attention.attention_head_size + ) + + assert self.self_attention.query.in_features == self.params_dict["hidden_size"] + assert self.self_attention.key.in_features == self.params_dict["hidden_size"] + assert self.self_attention.value.in_features == self.params_dict["hidden_size"] + + assert self.self_attention.dropout.p == self.params_dict["dropout"] + + @pytest.mark.parametrize("module_name, hf_module", get_modules(PARAMS_DICT).items()) + def test_forward_against_huggingface_output(self, module_name, hf_module): + hidden_states = torch.randn(2, 3, 6) + attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]]) + + torch.manual_seed(1234) + self_attention = SelfAttention.from_pretrained_module(hf_module) + + output = self_attention.forward(hidden_states, attention_mask=attention_mask) + if module_name == "distilbert": + hf_output = hf_module.forward( + hidden_states, hidden_states, hidden_states, mask=attention_mask + ) + else: + # We do this because bert, roberta, electra process the attention_mask at the model level. + attention_mask_hf = (attention_mask == 0).view((2, 1, 1, 3)).expand(2, 2, 3, 3) * -10e5 + hf_output = hf_module.forward(hidden_states, attention_mask=attention_mask_hf) + + assert torch.allclose(output[0], hf_output[0]) + + @pytest.mark.parametrize( + "pretrained_name", + [ + "bert-base-uncased", + "roberta-base", + "google/electra-base-generator", + "distilbert-base-uncased", + ], + ) + def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name): + + torch.manual_seed(1234) + pretrained = cached_transformers.get(pretrained_name, False) + + if "distilbert" in pretrained_name: + encoder = pretrained.transformer + else: + encoder = pretrained.encoder + # Hacky way to get a bert layer. + for i, pretrained_module in enumerate(encoder.layer.modules()): + if i == 1: + break + + # Get the self attention layer. 
+ if "distilbert" in pretrained_name: + pretrained_module = pretrained_module.attention + else: + pretrained_module = pretrained_module.attention.self + + torch.manual_seed(1234) + module = SelfAttention.from_pretrained_module(pretrained_name) + mapping = { + val: key + for key, val in module._construct_default_mapping( + pretrained_module, "huggingface", {} + ).items() + } + assert_equal_parameters(pretrained_module, module, mapping=mapping) + + batch_size = 2 + seq_len = 3 + dim = module.query.in_features + hidden_states = torch.randn(batch_size, seq_len, dim) + attention_mask = torch.randint(0, 2, (batch_size, 1, 1, seq_len)) + + # setting to eval mode to avoid non-deterministic dropout. + module = module.eval() + pretrained_module = pretrained_module.eval() + + torch.manual_seed(1234) + output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0] + if "distilbert" in pretrained_name: + torch.manual_seed(1234) + hf_output = pretrained_module.forward( + hidden_states, hidden_states, hidden_states, mask=attention_mask + )[0] + else: + # The attn_mask is processed outside the self attention module in HF bert models. + attention_mask = (~(attention_mask == 1)) * -10e5 + torch.manual_seed(1234) + hf_output = pretrained_module.forward(hidden_states, attention_mask=attention_mask)[0] + + assert torch.allclose(output, hf_output) diff --git a/tests/modules/transformer/toolkit_test.py b/tests/modules/transformer/toolkit_test.py new file mode 100644 index 00000000000..cd1bf60e9fd --- /dev/null +++ b/tests/modules/transformer/toolkit_test.py @@ -0,0 +1,146 @@ +import torch +from overrides import overrides +from transformers.models.albert.modeling_albert import AlbertEmbeddings + +from allennlp.common import cached_transformers +from allennlp.common.testing import assert_equal_parameters +from allennlp.data.vocabulary import Vocabulary +from allennlp.modules.token_embedders import Embedding, TokenEmbedder +from allennlp.modules.transformer import TransformerStack, TransformerEmbeddings +from allennlp.common.testing import AllenNlpTestCase + + +class TestTransformerToolkit(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + self.vocab = Vocabulary() + # populate vocab. 
+ self.vocab.add_token_to_namespace("word") + self.vocab.add_token_to_namespace("the") + self.vocab.add_token_to_namespace("an") + + def test_create_embedder_using_toolkit(self): + + embedding_file = str(self.FIXTURES_ROOT / "embeddings/glove.6B.300d.sample.txt.gz") + + class TinyTransformer(TokenEmbedder): + def __init__(self, vocab, embedding_dim, hidden_size, intermediate_size): + super().__init__() + self.embeddings = Embedding( + pretrained_file=embedding_file, + embedding_dim=embedding_dim, + projection_dim=hidden_size, + vocab=vocab, + ) + + self.transformer = TransformerStack( + num_hidden_layers=4, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + ) + + @overrides + def forward(self, token_ids: torch.LongTensor): + x = self.embeddings(token_ids) + x = self.transformer(x) + return x + + tiny = TinyTransformer(self.vocab, embedding_dim=300, hidden_size=80, intermediate_size=40) + tiny.forward(torch.LongTensor([[0, 1, 2]])) + + def test_use_first_four_layers_of_pretrained(self): + pretrained = cached_transformers.get("bert-base-uncased", False) + + class SmallTransformer(TokenEmbedder): + def __init__(self): + super().__init__() + self.embeddings = TransformerEmbeddings.from_pretrained_module(pretrained) + + self.transformer = TransformerStack.from_pretrained_module( + pretrained, num_hidden_layers=4 + ) + + @overrides + def forward(self, token_ids: torch.LongTensor): + x = self.embeddings(token_ids) + x = self.transformer(x) + return x + + small = SmallTransformer() + assert len(small.transformer.layers) == 4 + small.forward(torch.LongTensor([[0, 1, 2]])) + + def test_use_selected_layers_of_bert_for_different_purposes(self): + class MediumTransformer(torch.nn.Module): + def __init__(self): + super().__init__() + self.embeddings = TransformerEmbeddings.from_pretrained_module("bert-base-uncased") + self.separate_transformer = TransformerStack.from_pretrained_module( + "bert-base-uncased", num_hidden_layers=range(0, 8) + ) + self.combined_transformer = TransformerStack.from_pretrained_module( + "bert-base-uncased", + num_hidden_layers=range(8, 12), + ) + + @overrides + def forward( + self, + left_token_ids: torch.LongTensor, + right_token_ids: torch.LongTensor, + ): + + left = self.embeddings(left_token_ids) + left = self.separate_transformer(left) + + right = self.embeddings(right_token_ids) + right = self.separate_transformer(right) + + # combine the sequences in some meaningful way. here, we just add them. 
+ # combined = combine_masked_sequences(left, left_mask, right, right_mask) + combined = left + right + + return self.combined_transformer(combined) + + medium = MediumTransformer() + assert (len(medium.separate_transformer.layers)) == 8 + assert (len(medium.combined_transformer.layers)) == 4 + + pretrained = cached_transformers.get("bert-base-uncased", False) + pretrained_layers = dict(pretrained.encoder.layer.named_modules()) + + medium_layers = dict(medium.combined_transformer.layers.named_modules()) + + assert_equal_parameters( + medium_layers["0"], pretrained_layers["8"], TransformerStack._huggingface_mapping + ) + assert_equal_parameters( + medium_layers["1"], pretrained_layers["9"], TransformerStack._huggingface_mapping + ) + assert_equal_parameters( + medium_layers["2"], pretrained_layers["10"], TransformerStack._huggingface_mapping + ) + assert_equal_parameters( + medium_layers["3"], pretrained_layers["11"], TransformerStack._huggingface_mapping + ) + + def test_combination_of_two_different_berts(self): + # Regular BERT, but with AlBERT's special compressed embedding scheme + + class AlmostRegularTransformer(TokenEmbedder): + def __init__(self): + super().__init__() + self.embeddings = TransformerEmbeddings.get_relevant_module("albert-base-v2") + self.transformer = TransformerStack.from_pretrained_module("bert-base-uncased") + # We want to tune only the embeddings, because that's our experiment. + self.transformer.requires_grad = False + + @overrides + def forward(self, token_ids: torch.LongTensor, mask: torch.BoolTensor): + x = self.embeddings(token_ids, mask) + x = self.transformer(x) + return x + + almost = AlmostRegularTransformer() + assert len(almost.transformer.layers) == 12 + assert isinstance(almost.embeddings, AlbertEmbeddings) diff --git a/tests/modules/transformer/transformer_embeddings_test.py b/tests/modules/transformer/transformer_embeddings_test.py new file mode 100644 index 00000000000..08212ee15c9 --- /dev/null +++ b/tests/modules/transformer/transformer_embeddings_test.py @@ -0,0 +1,308 @@ +import pytest +import copy +import torch +from torch.testing import assert_allclose + +from allennlp.common import Params, FromParams +from allennlp.common import cached_transformers + +from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.bert.modeling_bert import BertEmbeddings +from transformers.models.albert.configuration_albert import AlbertConfig +from transformers.models.albert.modeling_albert import AlbertEmbeddings + +from allennlp.common.testing import assert_equal_parameters +from allennlp.modules.transformer import ( + TransformerEmbeddings, + ImageFeatureEmbeddings, + TransformerModule, +) +from allennlp.common.testing import AllenNlpTestCase + +PARAMS_DICT = { + "vocab_size": 20, + "embedding_size": 5, + "pad_token_id": 0, + "max_position_embeddings": 3, + "type_vocab_size": 2, + "dropout": 0.5, +} + + +def get_modules(params_dict): + modules = {} + params = copy.deepcopy(params_dict) + + params["hidden_dropout_prob"] = params.pop("dropout") + params["hidden_size"] = params.pop("embedding_size") + + # bert, roberta, electra self attentions have the same code. 
+ + torch.manual_seed(1234) + hf_module = BertEmbeddings(BertConfig(**params)) + modules["bert"] = hf_module + + albertparams = copy.deepcopy(params_dict) + albertparams["hidden_dropout_prob"] = albertparams.pop("dropout") + + torch.manual_seed(1234) + hf_module = AlbertEmbeddings(AlbertConfig(**albertparams)) + modules["albert"] = hf_module + + return modules + + +class TestTransformerEmbeddings(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = {key: val for key, val in PARAMS_DICT.items()} + + params = Params(copy.deepcopy(self.params_dict)) + + self.transformer_embeddings = TransformerEmbeddings.from_params(params) + + def test_can_construct_from_params(self): + + transformer_embeddings = self.transformer_embeddings.embeddings + + assert ( + transformer_embeddings.word_embeddings.num_embeddings == self.params_dict["vocab_size"] + ) + assert ( + transformer_embeddings.word_embeddings.embedding_dim + == self.params_dict["embedding_size"] + ) + assert ( + transformer_embeddings.word_embeddings.padding_idx == self.params_dict["pad_token_id"] + ) + + assert ( + transformer_embeddings.position_embeddings.num_embeddings + == self.params_dict["max_position_embeddings"] + ) + assert ( + transformer_embeddings.position_embeddings.embedding_dim + == self.params_dict["embedding_size"] + ) + + assert ( + transformer_embeddings.token_type_embeddings.num_embeddings + == self.params_dict["type_vocab_size"] + ) + assert ( + transformer_embeddings.token_type_embeddings.embedding_dim + == self.params_dict["embedding_size"] + ) + + assert ( + self.transformer_embeddings.layer_norm.normalized_shape[0] + == self.params_dict["embedding_size"] + ) + + assert self.transformer_embeddings.dropout.p == self.params_dict["dropout"] + + def test_sanity(self): + class TextEmbeddings(TransformerModule, FromParams): + def __init__( + self, + vocab_size: int, + hidden_size: int, + pad_token_id: int, + max_position_embeddings: int, + type_vocab_size: int, + dropout: float, + ): + super().__init__() + self.word_embeddings = torch.nn.Embedding( + vocab_size, hidden_size, padding_idx=pad_token_id + ) + self.position_embeddings = torch.nn.Embedding(max_position_embeddings, hidden_size) + self.token_type_embeddings = torch.nn.Embedding(type_vocab_size, hidden_size) + + self.layer_norm = torch.nn.LayerNorm(hidden_size, eps=1e-12) + self.dropout = torch.nn.Dropout(dropout) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + device = input_ids.device if input_ids is not None else inputs_embeds.device + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).expand(input_shape) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + torch.manual_seed(23) + text = TextEmbeddings(10, 5, 2, 3, 7, 0.0) + torch.manual_seed(23) + transformer = TransformerEmbeddings(10, 5, 2, 3, 
7, 0.0) + + input_ids = torch.tensor([[1, 2]]) + token_type_ids = torch.tensor([[1, 0]], dtype=torch.long) + position_ids = torch.tensor([[0, 1]]) + + text_output = text.forward(input_ids, token_type_ids, position_ids) + transformer_output = transformer.forward(input_ids, token_type_ids, position_ids) + + assert_allclose(text_output, transformer_output) + + def test_forward_runs_with_inputs(self): + input_ids = torch.tensor([[1, 2]]) + token_type_ids = torch.tensor([[1, 0]], dtype=torch.long) + position_ids = torch.tensor([[0, 1]]) + self.transformer_embeddings.forward( + input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + ) + + def test_output_size(self): + input_ids = torch.tensor([[1, 2]]) + token_type_ids = torch.tensor([[1, 0]], dtype=torch.long) + position_ids = torch.tensor([[0, 1]]) + params = copy.deepcopy(self.params_dict) + params["output_size"] = 7 + params = Params(params) + module = TransformerEmbeddings.from_params(params) + output = module.forward( + input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + ) + + assert output.shape[-1] == 7 + + def test_no_token_type_layer(self): + params = copy.deepcopy(self.params_dict) + params["type_vocab_size"] = 0 + params = Params(params) + module = TransformerEmbeddings.from_params(params) + + assert len(module.embeddings) == 2 + + @pytest.mark.parametrize( + "pretrained_name", + [ + "bert-base-uncased", + "albert-base-v2", + ], + ) + def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name): + pretrained_module = cached_transformers.get(pretrained_name, False).embeddings + module = TransformerEmbeddings.from_pretrained_module(pretrained_name) + mapping = { + val: key + for key, val in module._construct_default_mapping( + pretrained_module, "huggingface", {} + ).items() + } + missing = assert_equal_parameters(pretrained_module, module, mapping=mapping) + assert len(missing) == 0 + + @pytest.mark.parametrize("module_name, hf_module", get_modules(PARAMS_DICT).items()) + def test_forward_against_huggingface_output(self, module_name, hf_module): + input_ids = torch.tensor([[1, 2]]) + token_type_ids = torch.tensor([[1, 0]], dtype=torch.long) + position_ids = torch.tensor([[0, 1]]) + + torch.manual_seed(1234) + embeddings = TransformerEmbeddings.from_pretrained_module(hf_module) + + torch.manual_seed(1234) + embeddings = embeddings.eval() # setting to eval mode to avoid non-deterministic dropout. + output = embeddings.forward( + input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + ) + + torch.manual_seed(1234) + hf_module = hf_module.eval() # setting to eval mode to avoid non-deterministic dropout. 
+ hf_output = hf_module.forward( + input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + ) + + assert torch.allclose(output, hf_output) + + +class TestImageFeatureEmbeddings(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = {"feature_size": 3, "embedding_size": 5, "dropout": 0.1} + + params = Params(copy.deepcopy(self.params_dict)) + + self.img_embeddings = ImageFeatureEmbeddings.from_params(params) + + def test_can_construct_from_params(self): + assert ( + self.img_embeddings.embeddings.image_embeddings.in_features + == self.params_dict["feature_size"] + ) + assert ( + self.img_embeddings.embeddings.image_embeddings.out_features + == self.params_dict["embedding_size"] + ) + assert ( + self.img_embeddings.embeddings.location_embeddings.out_features + == self.params_dict["embedding_size"] + ) + assert self.img_embeddings.dropout.p == self.params_dict["dropout"] + + def test_forward_runs_with_inputs(self): + batch_size = 2 + feature_dim = self.params_dict["feature_size"] + image_feature = torch.randn(batch_size, feature_dim) + image_location = torch.randn(batch_size, 4) + self.img_embeddings.forward(image_feature, image_location) + + def test_sanity(self): + class OldImageFeatureEmbeddings(TransformerModule, FromParams): + """Construct the embeddings from image, spatial location (omit now) and + token_type embeddings. + """ + + def __init__(self, feature_size: int, embedding_size: int, dropout: float = 0.0): + super().__init__() + + self.image_embeddings = torch.nn.Linear(feature_size, embedding_size) + self.image_location_embeddings = torch.nn.Linear(4, embedding_size) + self.layer_norm = torch.nn.LayerNorm(embedding_size, eps=1e-12) + self.dropout = torch.nn.Dropout(dropout) + + def forward(self, image_feature: torch.Tensor, image_location: torch.Tensor): + img_embeddings = self.image_embeddings(image_feature) + loc_embeddings = self.image_location_embeddings(image_location) + embeddings = self.layer_norm(img_embeddings + loc_embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + torch.manual_seed(23) + old = OldImageFeatureEmbeddings(**self.params_dict) + torch.manual_seed(23) + now = ImageFeatureEmbeddings(**self.params_dict) + + batch_size = 2 + + image_feature = torch.randn(batch_size, self.params_dict["feature_size"]) + image_location = torch.randn(batch_size, 4) + + torch.manual_seed(23) + old_output = old.forward(image_feature, image_location) + torch.manual_seed(23) + now_output = now.forward(image_feature, image_location) + + assert_allclose(old_output, now_output) diff --git a/tests/modules/transformer/transformer_layer_test.py b/tests/modules/transformer/transformer_layer_test.py new file mode 100644 index 00000000000..1ecf183eace --- /dev/null +++ b/tests/modules/transformer/transformer_layer_test.py @@ -0,0 +1,382 @@ +import copy +import torch +import pytest + +from allennlp.common import Params +from allennlp.common import cached_transformers +from allennlp.common.testing import assert_equal_parameters +from allennlp.modules.transformer import AttentionLayer, TransformerLayer +from allennlp.common.testing import AllenNlpTestCase + +from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.bert.modeling_bert import BertAttention, BertLayer +from transformers.models.roberta.configuration_roberta import RobertaConfig +from transformers.models.roberta.modeling_roberta import RobertaAttention, RobertaLayer +from 
transformers.models.electra.configuration_electra import ElectraConfig +from transformers.models.electra.modeling_electra import ElectraAttention, ElectraLayer + +ATTENTION_PARAMS_DICT = { + "hidden_size": 6, + "num_attention_heads": 2, + "attention_dropout": 0.1, + "hidden_dropout": 0.2, +} + + +def get_attention_modules(params_dict): + modules = {} + params = copy.deepcopy(params_dict) + params["attention_probs_dropout_prob"] = params.pop("attention_dropout") + params["hidden_dropout_prob"] = params.pop("hidden_dropout") + + torch.manual_seed(1234) + hf_module = BertAttention(BertConfig(**params)) + modules["bert"] = hf_module + + torch.manual_seed(1234) + hf_module = RobertaAttention(RobertaConfig(**params)) + modules["roberta"] = hf_module + + torch.manual_seed(1234) + hf_module = ElectraAttention(ElectraConfig(**params)) + modules["electra"] = hf_module + + return modules + + +class TestAttentionLayer(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = { + "hidden_size": 6, + "num_attention_heads": 2, + "attention_dropout": 0.1, + "hidden_dropout": 0.2, + } + + params = Params(copy.deepcopy(self.params_dict)) + + self.attention_layer = AttentionLayer.from_params(params) + + def test_can_construct_from_params(self): + + attention_layer = self.attention_layer + + assert attention_layer.self.num_attention_heads == self.params_dict["num_attention_heads"] + assert attention_layer.self.attention_head_size == int( + self.params_dict["hidden_size"] / self.params_dict["num_attention_heads"] + ) + assert ( + attention_layer.self.all_head_size + == self.params_dict["num_attention_heads"] * attention_layer.self.attention_head_size + ) + assert attention_layer.self.query.in_features == self.params_dict["hidden_size"] + assert attention_layer.self.key.in_features == self.params_dict["hidden_size"] + assert attention_layer.self.value.in_features == self.params_dict["hidden_size"] + assert attention_layer.self.dropout.p == self.params_dict["attention_dropout"] + + assert attention_layer.output.dense.in_features == self.params_dict["hidden_size"] + assert attention_layer.output.dense.out_features == self.params_dict["hidden_size"] + assert ( + attention_layer.output.layer_norm.normalized_shape[0] == self.params_dict["hidden_size"] + ) + assert attention_layer.output.dropout.p == self.params_dict["hidden_dropout"] + + def test_forward_runs(self): + attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]]) + self.attention_layer.forward(torch.randn(2, 3, 6), attention_mask=attention_mask) + + @pytest.mark.parametrize( + "module_name, hf_module", get_attention_modules(ATTENTION_PARAMS_DICT).items() + ) + def test_forward_against_huggingface_outputs(self, module_name, hf_module): + hidden_states = torch.randn(2, 3, 6) + attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]]) + + attention = AttentionLayer.from_pretrained_module(hf_module) + + torch.manual_seed(1234) + output = attention.forward(hidden_states, attention_mask=attention_mask) + # We do this because bert, roberta, electra process the attention_mask at the model level. 
+ attention_mask_hf = (attention_mask == 0).view((2, 1, 1, 3)).expand(2, 2, 3, 3) * -10e5 + torch.manual_seed(1234) + hf_output = hf_module.forward(hidden_states, attention_mask=attention_mask_hf) + + assert torch.allclose(output[0], hf_output[0]) + + @pytest.mark.parametrize( + "pretrained_name", + [ + "bert-base-uncased", + "roberta-base", + ], + ) + def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name): + + torch.manual_seed(1234) + pretrained = cached_transformers.get(pretrained_name, False) + + if "distilbert" in pretrained_name: + encoder = pretrained.transformer + else: + encoder = pretrained.encoder + # Hacky way to get a bert layer. + for i, pretrained_module in enumerate(encoder.layer.modules()): + if i == 1: + break + + pretrained_module = pretrained_module.attention + + torch.manual_seed(1234) + module = AttentionLayer.from_pretrained_module(pretrained_name) + mapping = { + val: key + for key, val in module._construct_default_mapping( + pretrained_module, "huggingface", {} + ).items() + } + assert_equal_parameters(pretrained_module, module, mapping=mapping) + + batch_size = 2 + seq_len = 768 + dim = module.self.query.in_features + hidden_states = torch.randn(batch_size, seq_len, dim) + attention_mask = torch.randint(0, 2, (batch_size, seq_len)) + mask_reshp = (batch_size, 1, 1, dim) + attention_mask_hf = (attention_mask == 0).view(mask_reshp).expand( + batch_size, 12, seq_len, seq_len + ) * -10e5 + + # setting to eval mode to avoid non-deterministic dropout. + module = module.eval() + pretrained_module = pretrained_module.eval() + + torch.manual_seed(1234) + output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0] + torch.manual_seed(1234) + hf_output = pretrained_module.forward(hidden_states, attention_mask=attention_mask_hf)[0] + + assert torch.allclose(output, hf_output, atol=1e-04) + + +LAYER_PARAMS_DICT = { + "hidden_size": 6, + "intermediate_size": 3, + "num_attention_heads": 2, + "attention_dropout": 0.1, + "hidden_dropout": 0.2, + "activation": "relu", +} + + +def get_layer_modules(params_dict): + modules = {} + params = copy.deepcopy(params_dict) + params["attention_probs_dropout_prob"] = params.pop("attention_dropout") + params["hidden_dropout_prob"] = params.pop("hidden_dropout") + + # bert, roberta, electra, layoutlm self attentions have the same code. 
+ + torch.manual_seed(1234) + hf_module = BertLayer(BertConfig(**params)) + modules["bert"] = hf_module + + torch.manual_seed(1234) + hf_module = RobertaLayer(RobertaConfig(**params)) + modules["roberta"] = hf_module + + torch.manual_seed(1234) + hf_module = ElectraLayer(ElectraConfig(**params)) + modules["electra"] = hf_module + + return modules + + +class TestTransformerLayer(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = { + "hidden_size": 6, + "intermediate_size": 3, + "num_attention_heads": 2, + "attention_dropout": 0.1, + "hidden_dropout": 0.2, + "activation": "relu", + } + + params = Params(copy.deepcopy(self.params_dict)) + + self.transformer_layer = TransformerLayer.from_params(params) + self.pretrained_name = "bert-base-uncased" + + self.pretrained = cached_transformers.get(self.pretrained_name, False) + + def test_can_construct_from_params(self): + + transformer_layer = self.transformer_layer + + assert ( + transformer_layer.attention.self.num_attention_heads + == self.params_dict["num_attention_heads"] + ) + assert transformer_layer.attention.self.attention_head_size == int( + self.params_dict["hidden_size"] / self.params_dict["num_attention_heads"] + ) + assert ( + transformer_layer.attention.self.all_head_size + == self.params_dict["num_attention_heads"] + * transformer_layer.attention.self.attention_head_size + ) + assert transformer_layer.attention.self.query.in_features == self.params_dict["hidden_size"] + assert transformer_layer.attention.self.key.in_features == self.params_dict["hidden_size"] + assert transformer_layer.attention.self.value.in_features == self.params_dict["hidden_size"] + assert transformer_layer.attention.self.dropout.p == self.params_dict["attention_dropout"] + + assert ( + transformer_layer.attention.output.dense.in_features == self.params_dict["hidden_size"] + ) + assert ( + transformer_layer.attention.output.dense.out_features == self.params_dict["hidden_size"] + ) + assert ( + transformer_layer.attention.output.layer_norm.normalized_shape[0] + == self.params_dict["hidden_size"] + ) + assert transformer_layer.attention.output.dropout.p == self.params_dict["hidden_dropout"] + + assert transformer_layer.intermediate.dense.in_features == self.params_dict["hidden_size"] + assert ( + transformer_layer.intermediate.dense.out_features + == self.params_dict["intermediate_size"] + ) + + assert transformer_layer.output.dense.in_features == self.params_dict["intermediate_size"] + assert transformer_layer.output.dense.out_features == self.params_dict["hidden_size"] + + assert ( + transformer_layer.output.layer_norm.normalized_shape[0] + == self.params_dict["hidden_size"] + ) + + assert transformer_layer.output.dropout.p == self.params_dict["hidden_dropout"] + + def test_forward_runs(self): + attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]]) + self.transformer_layer.forward(torch.randn(2, 3, 6), attention_mask=attention_mask) + + with pytest.raises(AssertionError): + self.transformer_layer.forward( + torch.randn(2, 3, 6), + attention_mask=attention_mask, + encoder_hidden_states=torch.randn(2, 3, 6), + ) + + def test_cross_attention(self): + params = copy.deepcopy(self.params_dict) + params["add_cross_attention"] = True + + params = Params(params) + + transformer_layer = TransformerLayer.from_params(params) + assert hasattr(transformer_layer, "cross_attention") + + attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]]) + transformer_layer.forward( + torch.randn(2, 3, 6), + attention_mask=attention_mask, + 
encoder_hidden_states=torch.randn(2, 3, 6), + ) + + transformer_layer_new = TransformerLayer.from_pretrained_module( + transformer_layer, source="allennlp" + ) + + assert hasattr(transformer_layer_new, "cross_attention") + + def test_loading_from_pretrained_weights(self): + + # Hacky way to get a bert layer. + for i, pretrained_module in enumerate(self.pretrained.encoder.layer.modules()): + if i == 1: + break + + module = TransformerLayer.from_pretrained_module(pretrained_module) + mapping = { + val: key + for key, val in module._construct_default_mapping( + pretrained_module, "huggingface", {} + ).items() + } + assert_equal_parameters(pretrained_module, module, mapping=mapping) + + @pytest.mark.parametrize("module_name, hf_module", get_layer_modules(LAYER_PARAMS_DICT).items()) + def test_forward_against_huggingface_outputs(self, module_name, hf_module): + hidden_states = torch.randn(2, 3, 6) + attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]]) + + layer = TransformerLayer.from_pretrained_module(hf_module) + + torch.manual_seed(1234) + output = layer.forward(hidden_states, attention_mask=attention_mask) + # We do this because bert, roberta, electra process the attention_mask at the model level. + attention_mask_hf = (attention_mask == 0).view((2, 1, 1, 3)).expand(2, 2, 3, 3) * -10e5 + torch.manual_seed(1234) + hf_output = hf_module.forward(hidden_states, attention_mask=attention_mask_hf) + + assert torch.allclose(output[0], hf_output[0]) + + @pytest.mark.parametrize( + "pretrained_name", + [ + "bert-base-uncased", + "roberta-base", + ], + ) + def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name): + + torch.manual_seed(1234) + pretrained = cached_transformers.get(pretrained_name, False) + + if "distilbert" in pretrained_name: + encoder = pretrained.transformer + else: + encoder = pretrained.encoder + # Hacky way to get a bert layer. + for i, pretrained_module in enumerate(encoder.layer.modules()): + if i == 1: + break + + pretrained_module = pretrained_module + + torch.manual_seed(1234) + module = TransformerLayer.from_pretrained_module(pretrained_name) + mapping = { + val: key + for key, val in module._construct_default_mapping( + pretrained_module, "huggingface", {} + ).items() + } + assert_equal_parameters(pretrained_module, module, mapping=mapping) + + batch_size = 2 + seq_len = 768 + dim = module.attention.self.query.in_features + hidden_states = torch.randn(batch_size, seq_len, dim) + attention_mask = torch.randint(0, 2, (batch_size, seq_len)) + mask_reshp = (batch_size, 1, 1, dim) + attention_mask_hf = (attention_mask == 0).view(mask_reshp).expand( + batch_size, 12, seq_len, seq_len + ) * -10e5 + + # setting to eval mode to avoid non-deterministic dropout. 
+ module = module.eval() + pretrained_module = pretrained_module.eval() + + torch.manual_seed(1234) + output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0] + torch.manual_seed(1234) + hf_output = pretrained_module.forward(hidden_states, attention_mask=attention_mask_hf)[0] + + assert torch.allclose(output, hf_output, atol=1e-04) diff --git a/tests/modules/transformer/transformer_module_test.py b/tests/modules/transformer/transformer_module_test.py new file mode 100644 index 00000000000..d5002f215ea --- /dev/null +++ b/tests/modules/transformer/transformer_module_test.py @@ -0,0 +1,74 @@ +import torch + +from allennlp.common.testing import assert_equal_parameters +from allennlp.modules.transformer import TransformerModule +from allennlp.common.testing import AllenNlpTestCase + + +class TestTransformerModule(AllenNlpTestCase): + def test_can_load_pretrained_weights(self): + class InternalOld(torch.nn.Module): + def __init__(self, inp, out): + super().__init__() + self.ff = torch.nn.Linear(inp, out) + + def forward(self, x): + x = self.ff(x) + return x + + class InternalNew(TransformerModule): + def __init__(self, inp, out): + super().__init__() + self.linear = torch.nn.Linear(inp, out) + + def _construct_default_mapping(self, pretrained_module, source, mapping): + # return {"linear": "ff"} + return {"ff": "linear"} + + def forward(self, x): + x = self.linear(x) + return x + + class ExternalOld(torch.nn.Module): + def __init__(self, inp, out): + super().__init__() + self.internal = InternalOld(inp, out) + + def forward(self, x): + x = self.internal(x) + return x + + class External(TransformerModule): + # _huggingface_mapping = {"internal_layer": "internal"} + _huggingface_mapping = {"internal": "internal_layer"} + + def __init__(self, inp, out): + super().__init__() + self.internal_layer = InternalNew(inp, out) + + def forward(self, x): + x = self.internal_layer(x) + return x + + iold = InternalOld(3, 5) + x = torch.randn(4, 3) + iold.forward(x) + inew = InternalNew(3, 5) + inew._load_from_pretrained_module(iold) + mapping = { + val: key + for key, val in inew._construct_default_mapping(iold, "huggingface", {}).items() + } + assert_equal_parameters(iold, inew, mapping=mapping) + + eold = ExternalOld(3, 5) + x = torch.randn(4, 3) + eold.forward(x) + + enew = External(3, 5) + enew._load_from_pretrained_module(eold) + mapping = { + val: key + for key, val in enew._construct_default_mapping(eold, "huggingface", {}).items() + } + assert_equal_parameters(eold, enew, mapping=mapping) diff --git a/tests/modules/transformer/transformer_pooler_test.py b/tests/modules/transformer/transformer_pooler_test.py new file mode 100644 index 00000000000..4fb7842d9ae --- /dev/null +++ b/tests/modules/transformer/transformer_pooler_test.py @@ -0,0 +1,30 @@ +import copy +import torch + +from allennlp.common import Params +from allennlp.modules.transformer import TransformerPooler +from allennlp.common.testing import AllenNlpTestCase + + +class TestTransformerPooler(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = { + "hidden_size": 5, + "intermediate_size": 3, + } + + params = Params(copy.deepcopy(self.params_dict)) + + self.pooler = TransformerPooler.from_params(params) + + def test_can_construct_from_params(self): + + assert self.pooler.dense.in_features == self.params_dict["hidden_size"] + assert self.pooler.dense.out_features == self.params_dict["intermediate_size"] + + def test_forward_runs(self): + + out = 
self.pooler.forward(torch.randn(2, 7, 5)) + assert out.size() == (2, 3) diff --git a/tests/modules/transformer/transformer_stack_test.py b/tests/modules/transformer/transformer_stack_test.py new file mode 100644 index 00000000000..f9383960822 --- /dev/null +++ b/tests/modules/transformer/transformer_stack_test.py @@ -0,0 +1,232 @@ +import copy +import torch +import pytest + +from allennlp.common import Params +from allennlp.common import cached_transformers + +from allennlp.common.testing import assert_equal_parameters +from allennlp.modules.transformer import TransformerStack, TransformerLayer +from allennlp.common.testing import AllenNlpTestCase + +from transformers.models.bert.configuration_bert import BertConfig +from transformers.models.bert.modeling_bert import BertEncoder +from transformers.models.roberta.configuration_roberta import RobertaConfig +from transformers.models.roberta.modeling_roberta import RobertaEncoder +from transformers.models.electra.configuration_electra import ElectraConfig +from transformers.models.electra.modeling_electra import ElectraEncoder + +PARAMS_DICT = { + "num_hidden_layers": 3, + "hidden_size": 6, + "intermediate_size": 3, + "num_attention_heads": 2, + "attention_dropout": 0.1, + "hidden_dropout": 0.2, + "activation": "relu", +} + + +def get_modules(params_dict): + modules = {} + params = copy.deepcopy(params_dict) + params["attention_probs_dropout_prob"] = params.pop("attention_dropout") + params["hidden_dropout_prob"] = params.pop("hidden_dropout") + + torch.manual_seed(1234) + hf_module = BertEncoder(BertConfig(**params)) + modules["bert"] = hf_module + + torch.manual_seed(1234) + hf_module = RobertaEncoder(RobertaConfig(**params)) + modules["roberta"] = hf_module + + torch.manual_seed(1234) + hf_module = ElectraEncoder(ElectraConfig(**params)) + modules["electra"] = hf_module + + return modules + + +class TestTransformerStack(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + + self.params_dict = { + "num_hidden_layers": 3, + "hidden_size": 6, + "intermediate_size": 3, + "num_attention_heads": 2, + "attention_dropout": 0.1, + "hidden_dropout": 0.2, + "activation": "relu", + } + + params = Params(copy.deepcopy(self.params_dict)) + + self.transformer_stack = TransformerStack.from_params(params) + + self.pretrained_name = "bert-base-uncased" + + self.pretrained = cached_transformers.get(self.pretrained_name, False) + + def test_can_construct_from_params(self): + + modules = dict(self.transformer_stack.named_modules()) + assert len(modules["layers"]) == self.params_dict["num_hidden_layers"] + + def test_forward_runs(self): + self.transformer_stack.forward(torch.randn(2, 3, 6), attention_mask=torch.randn(2, 3)) + + with pytest.raises(AssertionError): + self.transformer_stack.forward( + torch.randn(2, 3, 6), + attention_mask=torch.randn(2, 3), + encoder_hidden_states=torch.randn(2, 3, 6), + ) + + def test_layer_same_as_params(self): + params = copy.deepcopy(self.params_dict) + num_hidden_layers = params.pop("num_hidden_layers") + # params = Params(params) + + torch.manual_seed(1234) + transformer_layer = TransformerLayer(**params) + transformer_stack_from_layer = TransformerStack(num_hidden_layers, transformer_layer) + torch.manual_seed(1234) + transformer_stack_from_params = TransformerStack(num_hidden_layers, **params) + + hidden_states = torch.randn(2, 3, 6) + attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]]) + + transformer_stack_from_layer.eval() + transformer_stack_from_params.eval() + + torch.manual_seed(1234) + 
layer_output = transformer_stack_from_layer.forward( + hidden_states, attention_mask=attention_mask + ) + + torch.manual_seed(1234) + params_output = transformer_stack_from_params.forward( + hidden_states, attention_mask=attention_mask + ) + + assert torch.allclose(layer_output[0], params_output[0]) + + def test_cross_attention(self): + params = copy.deepcopy(self.params_dict) + params["add_cross_attention"] = True + + params = Params(params) + + transformer_stack = TransformerStack.from_params(params) + modules = dict(transformer_stack.named_modules()) + + assert hasattr(modules["layers.0"], "cross_attention") + + attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]]) + transformer_stack.forward( + torch.randn(2, 3, 6), + attention_mask=attention_mask, + encoder_hidden_states=torch.randn(2, 3, 6), + ) + + transformer_stack_new = TransformerStack.from_pretrained_module( + transformer_stack, source="allennlp" + ) + + new_modules = dict(transformer_stack_new.named_modules()) + assert hasattr(new_modules["layers.0"], "cross_attention") + + def test_loading_from_pretrained_weights(self): + pretrained_module = self.pretrained.encoder + module = TransformerStack.from_pretrained_module(pretrained_module) + mapping = { + val: key + for key, val in module._construct_default_mapping( + pretrained_module, "huggingface", {} + ).items() + } + assert_equal_parameters(pretrained_module, module, mapping) + + def test_loading_partial_pretrained_weights(self): + + kwargs = TransformerStack._get_input_arguments(self.pretrained.encoder) + # The pretrained module has 12 bert layers, while the instance will have only 3. + kwargs["num_hidden_layers"] = 3 + transformer_stack = TransformerStack(**kwargs) + transformer_stack._load_from_pretrained_module(self.pretrained.encoder) + mapping = { + val: key + for key, val in transformer_stack._construct_default_mapping( + self.pretrained.encoder, "huggingface", {} + ).items() + } + assert_equal_parameters( + self.pretrained.encoder, + transformer_stack, + mapping, + ) + + @pytest.mark.parametrize("module_name, hf_module", get_modules(PARAMS_DICT).items()) + def test_forward_against_huggingface_outputs(self, module_name, hf_module): + hidden_states = torch.randn(2, 3, 6) + attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]]) + + stack = TransformerStack.from_pretrained_module(hf_module) + + torch.manual_seed(1234) + output = stack.forward(hidden_states, attention_mask=attention_mask) + # We do this because bert, roberta, electra process the attention_mask at the model level. 
+ attention_mask_hf = (attention_mask == 0).view((2, 1, 1, 3)).expand(2, 2, 3, 3) * -10e5 + torch.manual_seed(1234) + hf_output = hf_module.forward(hidden_states, attention_mask=attention_mask_hf) + + assert torch.allclose(output[0], hf_output[0]) + + @pytest.mark.parametrize( + "pretrained_name", + [ + "bert-base-uncased", + ], + ) + def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name): + + torch.manual_seed(1234) + pretrained = cached_transformers.get(pretrained_name, False) + + if "distilbert" in pretrained_name: + pretrained_module = pretrained.transformer + else: + pretrained_module = pretrained.encoder + + torch.manual_seed(1234) + module = TransformerStack.from_pretrained_module(pretrained_name) + mapping = { + val: key + for key, val in module._construct_default_mapping( + pretrained_module, "huggingface", {} + ).items() + } + assert_equal_parameters(pretrained_module, module, mapping=mapping) + + batch_size = 1 + seq_len = 768 + dim = dict(module.named_modules())["layers.0.attention.self.query"].in_features + hidden_states = torch.randn(batch_size, seq_len, dim) + attention_mask = torch.randint(0, 2, (batch_size, seq_len)) + mask_reshp = (batch_size, 1, 1, dim) + attention_mask_hf = (attention_mask == 0).view(mask_reshp) + attention_mask_hf = attention_mask_hf.expand(batch_size, 12, seq_len, seq_len) * -10e5 + + # setting to eval mode to avoid non-deterministic dropout. + module = module.eval() + pretrained_module = pretrained_module.eval() + + torch.manual_seed(1234) + output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0] + torch.manual_seed(1234) + hf_output = pretrained_module.forward(hidden_states, attention_mask=attention_mask_hf)[0] + + assert torch.allclose(output, hf_output) diff --git a/tests/modules/vision/__init__.py b/tests/modules/vision/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/modules/vision/grid_embedder_test.py b/tests/modules/vision/grid_embedder_test.py new file mode 100644 index 00000000000..9fddfb27727 --- /dev/null +++ b/tests/modules/vision/grid_embedder_test.py @@ -0,0 +1,16 @@ +from allennlp.common.testing import AllenNlpTestCase, requires_gpu +from allennlp.data.image_loader import TorchImageLoader +from allennlp.modules.vision.grid_embedder import ResnetBackbone + + +class TestResnetBackbone(AllenNlpTestCase): + @requires_gpu + def test_forward_runs(self): + loader = TorchImageLoader(device="cuda:0") + backbone = ResnetBackbone().to("cuda:0") + + image_pixels, image_size = loader( + [self.FIXTURES_ROOT / "data" / "images" / "COCO_train2014_000000458752.jpg"] + ) + result = backbone(image_pixels, image_size) + assert tuple(result.keys()) == backbone.get_feature_names() diff --git a/tests/modules/vision/region_detector_test.py b/tests/modules/vision/region_detector_test.py new file mode 100644 index 00000000000..a8608fbddff --- /dev/null +++ b/tests/modules/vision/region_detector_test.py @@ -0,0 +1,52 @@ +import torchvision + +from allennlp.common.testing import AllenNlpTestCase, requires_gpu +from allennlp.data.image_loader import TorchImageLoader +from allennlp.modules.vision.grid_embedder import ResnetBackbone +from allennlp.modules.vision.region_detector import FasterRcnnRegionDetector + + +class TestFasterRcnnRegionDetector(AllenNlpTestCase): + @requires_gpu + def test_forward_runs(self): + loader = TorchImageLoader(resize=True, normalize=True, device="cuda:0") + backbone = ResnetBackbone().to(device="cuda:0") + backbone.eval() + detector = 
FasterRcnnRegionDetector().to(device="cuda:0") + detector.eval() + + image_path = self.FIXTURES_ROOT / "data" / "images" / "COCO_train2014_000000458752.jpg" + + images, sizes = loader([image_path, image_path]) + image_features = backbone(images, sizes) + del backbone + detections = detector(images, sizes, image_features) + del detector + + assert len(detections.features) == 2 + assert len(detections.boxes) == 2 + assert len(detections.class_probs) == 2 + assert len(detections.class_labels) == 2 + + assert detections.features[0].shape[0] >= 1 + assert detections.features[0].shape[1] == 1024 + assert ( + detections.features[0].shape[0] + == detections.boxes[0].shape[0] + == detections.class_probs[0].shape[0] + == detections.class_labels[0].shape[0] + ) + + # Okay, cool, so far so good. Now let's make sure the output we got + # actually matches exactly what we would get using the full pipeline + # directly from torchvision. + raw_loader = TorchImageLoader(resize=False, normalize=False, device="cuda:0") + image, _ = raw_loader(image_path) + model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True).to("cuda:0") + model.eval() + result = model([image, image]) + # We can't compare the boxes directly because the boxes here are post-processed + # back to reference the original un-resized image. But we can compare + # the labels and scores. They should match exactly. + assert (result[0]["labels"] == detections.class_labels[0]).all() + assert (result[0]["scores"] == detections.class_probs[0]).all() diff --git a/tests/nn/util_test.py b/tests/nn/util_test.py index 705f3f7ab74..d98439534ff 100644 --- a/tests/nn/util_test.py +++ b/tests/nn/util_test.py @@ -1427,25 +1427,6 @@ def test_combine_tensors_and_multiply_with_batch_size_one_and_seq_len_one(self): assert_almost_equal(result.size(), [1, seq_len_1, seq_len_2]) - def test_has_tensor(self): - - has_tensor = util.has_tensor - tensor = torch.tensor([1, 2, 3]) - - assert has_tensor(["a", 10, tensor]) - assert not has_tensor(["a", 10]) - - assert has_tensor(("a", 10, tensor)) - assert not has_tensor(("a", 10)) - - assert has_tensor({"a": tensor, "b": 1}) - assert not has_tensor({"a": 10, "b": 1}) - - assert has_tensor(tensor) - assert not has_tensor(3) - - assert has_tensor({"x": [0, {"inside": {"double_inside": [3, [10, tensor]]}}]}) - def test_combine_initial_dims(self): tensor = torch.randn(4, 10, 20, 17, 5) @@ -1471,13 +1452,13 @@ def test_inspect_model_parameters(self): assert parameters_inspection_dict == util.inspect_parameters(model) def test_move_to_device(self): - # We're faking the tensor here so that we can test the calls to .cuda() without actually + # We're faking the tensor here so that we can test the calls to .to() without actually # needing a GPU. 
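+        # FakeTensor only records the device passed to .to(), so the test can check which
+        # device the move_to_device helper under test targeted, without requiring CUDA.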
class FakeTensor(torch.Tensor): def __init__(self): self._device = None - def cuda(self, device): + def to(self, device, **kwargs): self._device = device return self diff --git a/tests/predictors/predictor_test.py b/tests/predictors/predictor_test.py index 1c04e4255af..33969b92130 100644 --- a/tests/predictors/predictor_test.py +++ b/tests/predictors/predictor_test.py @@ -45,6 +45,7 @@ def test_get_gradients(self): predictor = Predictor.from_archive(archive) instance = predictor._json_to_instance(inputs) + predictor._dataset_reader.apply_token_indexers(instance) outputs = predictor._model.forward_on_instance(instance) labeled_instances = predictor.predictions_to_labeled_instances(instance, outputs) for instance in labeled_instances: @@ -70,6 +71,7 @@ def test_get_gradients_when_requires_grad_is_false(self): embedding_layer = util.find_embedding_layer(predictor._model) assert not embedding_layer.weight.requires_grad instance = predictor._json_to_instance(inputs) + predictor._dataset_reader.apply_token_indexers(instance) outputs = predictor._model.forward_on_instance(instance) labeled_instances = predictor.predictions_to_labeled_instances(instance, outputs) # ensure that gradients are always present, despite requires_grad being false on the embedding layer diff --git a/tests/predictors/sentence_tagger_test.py b/tests/predictors/sentence_tagger_test.py index f6a36270178..9741e58864c 100644 --- a/tests/predictors/sentence_tagger_test.py +++ b/tests/predictors/sentence_tagger_test.py @@ -13,6 +13,7 @@ def test_predictions_to_labeled_instances(self): predictor = Predictor.from_archive(archive, "sentence_tagger") instance = predictor._json_to_instance(inputs) + predictor._dataset_reader.apply_token_indexers(instance) outputs = predictor._model.forward_on_instance(instance) new_instances = predictor.predictions_to_labeled_instances(instance, outputs) assert len(new_instances) > 1 diff --git a/tests/predictors/text_classifier_test.py b/tests/predictors/text_classifier_test.py index 852d539ccf0..ad0a2d7be02 100644 --- a/tests/predictors/text_classifier_test.py +++ b/tests/predictors/text_classifier_test.py @@ -90,6 +90,7 @@ def test_predictions_to_labeled_instances(self): predictor = Predictor.from_archive(archive, "text_classifier") instance = predictor._json_to_instance(inputs) + predictor._dataset_reader.apply_token_indexers(instance) outputs = predictor._model.forward_on_instance(instance) new_instances = predictor.predictions_to_labeled_instances(instance, outputs) assert "label" in new_instances[0].fields diff --git a/tests/training/learning_rate_schedulers/slanted_triangular_test.py b/tests/training/learning_rate_schedulers/slanted_triangular_test.py index fadd7582186..5280970a34a 100644 --- a/tests/training/learning_rate_schedulers/slanted_triangular_test.py +++ b/tests/training/learning_rate_schedulers/slanted_triangular_test.py @@ -5,11 +5,10 @@ import torch import pytest -from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset from allennlp.common import Lazy, Params from allennlp.common.checks import ConfigurationError from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import PyTorchDataLoader +from allennlp.data.data_loaders import SimpleDataLoader from allennlp.training import Trainer from allennlp.training.learning_rate_schedulers import LearningRateScheduler, SlantedTriangular from allennlp.training.optimizers import Optimizer @@ -114,14 +113,14 @@ def test_from_params_in_trainer(self): ) # The method called in the logic below only checks the 
length of this list, not its # contents, so this should be safe. - instances = AllennlpDataset([1] * 40) + instances = [1] * 40 optim = self._get_optimizer() trainer = Trainer.from_params( model=self.model, optimizer=Lazy(lambda **kwargs: optim), serialization_dir=self.TEST_DIR, params=params, - data_loader=PyTorchDataLoader(instances, batch_size=10), + data_loader=SimpleDataLoader(instances, batch_size=10), ) assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular) @@ -151,7 +150,7 @@ def test_from_params_in_trainer(self): optimizer=Lazy(lambda **kwargs: optim), serialization_dir=self.TEST_DIR, params=params, - data_loader=PyTorchDataLoader(instances, batch_size=10), + data_loader=SimpleDataLoader(instances, batch_size=10), ) assert trainer._learning_rate_scheduler.num_epochs == 3 diff --git a/tests/training/optimizer_test.py b/tests/training/optimizer_test.py index ee11168b66a..b396cdcd4cc 100644 --- a/tests/training/optimizer_test.py +++ b/tests/training/optimizer_test.py @@ -2,7 +2,7 @@ from allennlp.common.testing import AllenNlpTestCase from allennlp.data import Vocabulary from allennlp.data.dataset_readers import SequenceTaggingDatasetReader -from allennlp.data import PyTorchDataLoader +from allennlp.data.data_loaders import SimpleDataLoader from allennlp.models.simple_tagger import SimpleTagger from allennlp.training import GradientDescentTrainer from allennlp.training.optimizers import Optimizer @@ -80,8 +80,10 @@ def test_optimizer_parameter_groups(self): class TestDenseSparseAdam(AllenNlpTestCase): def setup_method(self): super().setup_method() - self.instances = SequenceTaggingDatasetReader().read( - self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv" + self.instances = list( + SequenceTaggingDatasetReader().read( + self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv" + ) ) self.vocab = Vocabulary.from_instances(self.instances) self.model_params = Params( @@ -100,5 +102,6 @@ def test_can_optimise_model_with_dense_and_sparse_params(self): optimizer_params = Params({"type": "dense_sparse_adam"}) parameters = [[n, p] for n, p in self.model.named_parameters() if p.requires_grad] optimizer = Optimizer.from_params(model_parameters=parameters, params=optimizer_params) - self.instances.index_with(self.vocab) - GradientDescentTrainer(self.model, optimizer, PyTorchDataLoader(self.instances, 2)).train() + for instance in self.instances: + instance.index_fields(self.vocab) + GradientDescentTrainer(self.model, optimizer, SimpleDataLoader(self.instances, 2)).train() diff --git a/tests/training/trainer_test.py b/tests/training/trainer_test.py index 1a2b00b83be..fbf93461652 100644 --- a/tests/training/trainer_test.py +++ b/tests/training/trainer_test.py @@ -10,15 +10,13 @@ import pytest import torch -from torch.utils.data import DataLoader from torch.nn.utils import clip_grad_norm_ -from allennlp.data.dataloader import PyTorchDataLoader from allennlp.common.checks import ConfigurationError from allennlp.common.params import Params from allennlp.common.testing import AllenNlpTestCase, requires_gpu, requires_multi_gpu from allennlp.data import Vocabulary -from allennlp.data.dataloader import TensorDict +from allennlp.data.data_loaders import MultiProcessDataLoader, SimpleDataLoader, TensorDict from allennlp.data.dataset_readers import SequenceTaggingDatasetReader from allennlp.models.model import Model from allennlp.models.simple_tagger import SimpleTagger @@ -26,8 +24,6 @@ GradientDescentTrainer, Checkpointer, TensorboardWriter, - BatchCallback, - EpochCallback, TrainerCallback, 
TrackEpochCallback, ) @@ -35,20 +31,21 @@ from allennlp.training.learning_rate_schedulers import ExponentialLearningRateScheduler from allennlp.training.momentum_schedulers import MomentumScheduler from allennlp.training.moving_average import ExponentialMovingAverage -from allennlp.data import allennlp_collate class TrainerTestBase(AllenNlpTestCase): def setup_method(self): super().setup_method() - self.instances = SequenceTaggingDatasetReader().read( - self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv" + self.data_path = str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv") + self.reader = SequenceTaggingDatasetReader() + self.data_loader = MultiProcessDataLoader(self.reader, self.data_path, batch_size=2) + self.data_loader_lazy = MultiProcessDataLoader( + self.reader, self.data_path, batch_size=2, max_instances_in_memory=10 ) - self.instances_lazy = SequenceTaggingDatasetReader(lazy=True).read( - self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv" - ) - vocab = Vocabulary.from_instances(self.instances) - self.vocab = vocab + self.instances = list(self.data_loader.iter_instances()) + self.vocab = Vocabulary.from_instances(self.instances) + self.data_loader.index_with(self.vocab) + self.data_loader_lazy.index_with(self.vocab) self.model_params = Params( { "text_field_embedder": { @@ -59,15 +56,10 @@ def setup_method(self): ) self.model = SimpleTagger.from_params(vocab=self.vocab, params=self.model_params) self.optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9) - self.data_loader = DataLoader(self.instances, batch_size=2, collate_fn=allennlp_collate) - self.data_loader_lazy = DataLoader( - self.instances_lazy, batch_size=2, collate_fn=allennlp_collate - ) - self.validation_data_loader = DataLoader( - self.instances, batch_size=2, collate_fn=allennlp_collate + self.validation_data_loader = MultiProcessDataLoader( + self.reader, self.data_path, batch_size=2 ) - self.instances.index_with(vocab) - self.instances_lazy.index_with(vocab) + self.validation_data_loader.index_with(self.vocab) class TestTrainer(TrainerTestBase): @@ -166,18 +158,12 @@ def test_data_loader_lazy_epoch_size_correct(self): assert trainer._batch_num_total == num_epochs * 2 def test_data_loader_lazy_epoch_size_correct_custom_epoch_size(self): - batches_per_epoch = 3 + self.data_loader_lazy.batches_per_epoch = 3 num_epochs = 3 - data_loader_custom_epoch_lazy = PyTorchDataLoader( - self.instances_lazy, - batch_size=2, - collate_fn=allennlp_collate, - batches_per_epoch=batches_per_epoch, - ) trainer = GradientDescentTrainer( self.model, self.optimizer, - data_loader_custom_epoch_lazy, + self.data_loader_lazy, validation_data_loader=self.validation_data_loader, num_epochs=num_epochs, serialization_dir=self.TEST_DIR, @@ -186,15 +172,14 @@ def test_data_loader_lazy_epoch_size_correct_custom_epoch_size(self): metrics = trainer.train() epoch = metrics["epoch"] assert epoch == num_epochs - 1 - assert trainer._batch_num_total == num_epochs * batches_per_epoch + assert trainer._batch_num_total == num_epochs * 3 def test_trainer_respects_epoch_size_equals_total(self): batches_per_epoch = 4 num_epochs = 3 - data_loader_equal_epoch = PyTorchDataLoader( + data_loader_equal_epoch = SimpleDataLoader( self.instances, - batch_size=2, - collate_fn=allennlp_collate, + 2, batches_per_epoch=batches_per_epoch, ) trainer = GradientDescentTrainer( @@ -214,10 +199,9 @@ def test_trainer_respects_epoch_size_equals_total(self): def test_trainer_respects_epoch_size_larger_tnan_total(self): batches_per_epoch = 7 num_epochs = 3 
- data_loader_larger_epoch = PyTorchDataLoader( + data_loader_larger_epoch = SimpleDataLoader( self.instances, - batch_size=2, - collate_fn=allennlp_collate, + 2, batches_per_epoch=batches_per_epoch, ) trainer = GradientDescentTrainer( @@ -237,10 +221,9 @@ def test_trainer_respects_epoch_size_larger_tnan_total(self): def test_trainer_respects_epoch_size_smaller_tnan_total(self): batches_per_epoch = 1 num_epochs = 2 - data_loader_smaller_epoch = PyTorchDataLoader( + data_loader_smaller_epoch = SimpleDataLoader( self.instances, - batch_size=2, - collate_fn=allennlp_collate, + 2, batches_per_epoch=batches_per_epoch, ) trainer = GradientDescentTrainer( @@ -330,28 +313,31 @@ def test_metric_only_considered_best_so_far_when_strictly_better_than_those_befo num_epochs=3, serialization_dir=self.TEST_DIR, patience=5, - validation_metric="+test", + validation_metric="+acc", ) tracker = new_trainer._metric_tracker # when it is the only metric it should be considered the best new_tracker = copy.deepcopy(tracker) - new_tracker.add_metric(1) + new_tracker.add_metrics({"acc": 1}) assert new_tracker.is_best_so_far() # when it is the same as one before it it is not considered the best new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.3]) + for acc in [0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.3]: + new_tracker.add_metrics({"acc": acc}) assert not new_tracker.is_best_so_far() # when it is the best it is considered the best new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 13]) + for acc in [0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 13]: + new_tracker.add_metrics({"acc": acc}) assert new_tracker.is_best_so_far() # when it is not the the best it is not considered the best new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.0013]) + for acc in [0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.0013]: + new_tracker.add_metrics({"acc": acc}) assert not new_tracker.is_best_so_far() def test_metric_only_considered_best_so_far_when_strictly_better_than_those_before_it_decreasing_metric( @@ -365,28 +351,31 @@ def test_metric_only_considered_best_so_far_when_strictly_better_than_those_befo num_epochs=3, serialization_dir=self.TEST_DIR, patience=5, - validation_metric="-test", + validation_metric="-acc", ) tracker = new_trainer._metric_tracker # when it is the only metric it should be considered the best new_tracker = copy.deepcopy(tracker) - new_tracker.add_metric(1) + new_tracker.add_metrics({"acc": 1}) assert new_tracker.is_best_so_far() # when it is the same as one before it it is not considered the best new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.3]) + for acc in [0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.3]: + new_tracker.add_metrics({"acc": acc}) assert not new_tracker.is_best_so_far() # when it is the best it is considered the best new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.0013]) + for acc in [0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 0.0013]: + new_tracker.add_metrics({"acc": acc}) assert new_tracker.is_best_so_far() # when it is not the the best it is not considered the best new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 13]) + for acc in [0.3, 0.3, 0.3, 0.2, 0.5, 0.1, 13]: + new_tracker.add_metrics({"acc": acc}) def test_should_stop_early_with_increasing_metric(self): new_trainer = GradientDescentTrainer( @@ -397,21 +386,23 @@ def test_should_stop_early_with_increasing_metric(self): 
num_epochs=3, serialization_dir=self.TEST_DIR, patience=5, - validation_metric="+test", + validation_metric="+acc", ) tracker = new_trainer._metric_tracker new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.5, 0.3, 0.2, 0.1, 0.4, 0.4]) + for acc in [0.5, 0.3, 0.2, 0.1, 0.4, 0.4]: + new_tracker.add_metrics({"acc": acc}) assert new_tracker.should_stop_early() new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.3, 0.3, 0.3, 0.2, 0.5, 0.1]) + for acc in [0.3, 0.3, 0.3, 0.2, 0.5, 0.1]: + new_tracker.add_metrics({"acc": acc}) assert not new_tracker.should_stop_early() def test_should_stop_early_with_flat_lining_metric(self): - flatline = [0.2] * 6 + flatline = [{"acc": 0.2}] * 6 tracker = GradientDescentTrainer( self.model, self.optimizer, @@ -420,9 +411,10 @@ def test_should_stop_early_with_flat_lining_metric(self): num_epochs=3, serialization_dir=self.TEST_DIR, patience=5, - validation_metric="+test", + validation_metric="+acc", )._metric_tracker - tracker.add_metrics(flatline) + for m in flatline: + tracker.add_metrics(m) assert tracker.should_stop_early tracker = GradientDescentTrainer( @@ -433,9 +425,10 @@ def test_should_stop_early_with_flat_lining_metric(self): num_epochs=3, serialization_dir=self.TEST_DIR, patience=5, - validation_metric="-test", + validation_metric="-acc", )._metric_tracker - tracker.add_metrics(flatline) + for m in flatline: + tracker.add_metrics(m) assert tracker.should_stop_early def test_should_stop_early_with_decreasing_metric(self): @@ -447,20 +440,23 @@ def test_should_stop_early_with_decreasing_metric(self): num_epochs=3, serialization_dir=self.TEST_DIR, patience=5, - validation_metric="-test", + validation_metric="-acc", ) tracker = new_trainer._metric_tracker new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.02, 0.3, 0.2, 0.1, 0.4, 0.4]) + for acc in [0.02, 0.3, 0.2, 0.1, 0.4, 0.4]: + new_tracker.add_metrics({"acc": acc}) assert new_tracker.should_stop_early() new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.3, 0.3, 0.2, 0.1, 0.4, 0.5]) + for acc in [0.3, 0.3, 0.2, 0.1, 0.4, 0.5]: + new_tracker.add_metrics({"acc": acc}) assert not new_tracker.should_stop_early() new_tracker = copy.deepcopy(tracker) - new_tracker.add_metrics([0.1, 0.3, 0.2, 0.1, 0.4, 0.5]) + for acc in [0.1, 0.3, 0.2, 0.1, 0.4, 0.5]: + new_tracker.add_metrics({"acc": acc}) assert new_tracker.should_stop_early() def test_should_stop_early_with_early_stopping_disabled(self): @@ -472,10 +468,11 @@ def test_should_stop_early_with_early_stopping_disabled(self): validation_data_loader=self.validation_data_loader, num_epochs=100, patience=None, - validation_metric="+test", + validation_metric="+acc", ) tracker = trainer._metric_tracker - tracker.add_metrics([float(i) for i in reversed(range(20))]) + for m in [{"acc": float(i)} for i in reversed(range(20))]: + tracker.add_metrics(m) assert not tracker.should_stop_early() # Decreasing metric @@ -486,10 +483,11 @@ def test_should_stop_early_with_early_stopping_disabled(self): validation_data_loader=self.validation_data_loader, num_epochs=100, patience=None, - validation_metric="-test", + validation_metric="-acc", ) tracker = trainer._metric_tracker - tracker.add_metrics([float(i) for i in range(20)]) + for m in [{"acc": float(i)} for i in range(20)]: + tracker.add_metrics(m) assert not tracker.should_stop_early() def test_should_stop_early_with_invalid_patience(self): @@ -507,7 +505,7 @@ def test_should_stop_early_with_invalid_patience(self): validation_data_loader=self.validation_data_loader, 
num_epochs=100, patience=patience, - validation_metric="+test", + validation_metric="+acc", ) def test_trainer_can_run_and_resume_with_momentum_scheduler(self): @@ -669,7 +667,7 @@ def test_trainer_respects_keep_serialized_model_every_num_seconds(self): # 2, 4, plus the last two at 5 and 6. class SlowDataLoader: - data_loader = DataLoader(self.instances, batch_size=2, collate_fn=allennlp_collate) + data_loader = SimpleDataLoader(self.instances, batch_size=2) def __iter__(self): time.sleep(2.5) @@ -678,6 +676,9 @@ def __iter__(self): def __len__(self): return len(self.data_loader) + def set_target_device(self, _): + pass + trainer = GradientDescentTrainer( self.model, self.optimizer, @@ -700,7 +701,7 @@ def __len__(self): assert sorted(epochs) == [1, 3, 4, 5] def test_trainer_can_log_learning_rates_tensorboard(self): - data_loader = DataLoader(self.instances, batch_size=4, collate_fn=allennlp_collate) + data_loader = SimpleDataLoader(self.instances, 4) trainer = GradientDescentTrainer( self.model, self.optimizer, @@ -717,7 +718,7 @@ def test_trainer_can_log_learning_rates_tensorboard(self): trainer.train() def test_trainer_saves_models_at_specified_interval(self): - data_loader = DataLoader(self.instances, batch_size=4, collate_fn=allennlp_collate) + data_loader = SimpleDataLoader(self.instances, 4) trainer = GradientDescentTrainer( self.model, @@ -882,37 +883,6 @@ def test_restored_training_returns_best_epoch_metrics_even_if_no_better_epoch_is assert training_metrics["best_epoch"] == 0 assert training_metrics["validation_loss"] > restored_metrics["validation_loss"] - def test_restoring_works_with_older_checkpointing(self): - trainer = GradientDescentTrainer( - self.model, - self.optimizer, - self.data_loader, - validation_data_loader=self.validation_data_loader, - num_epochs=3, - serialization_dir=self.TEST_DIR, - checkpointer=Checkpointer( - serialization_dir=self.TEST_DIR, num_serialized_models_to_keep=4 - ), - ) - trainer.train() - - for index in range(3): - path = str(self.TEST_DIR / "training_state_epoch_{}.th".format(index)) - state = torch.load(path) - state.pop("metric_tracker") - state.pop("batch_num_total") - state["val_metric_per_epoch"] = [0.4, 0.1, 0.8] - torch.save(state, path) - - next_epoch = trainer._restore_checkpoint() - best_epoch = trainer._metric_tracker.best_epoch - - # Loss decreases in 3 epochs, but because we hard fed the val metrics as above: - assert next_epoch == 3 - assert best_epoch == 1 - assert trainer._metric_tracker._best_so_far == 0.1 - assert trainer._metric_tracker._epochs_with_no_improvement == 1 - def test_trainer_can_run_gradient_accumulation(self): instances = list(self.instances) steps_to_accumulate = 2 @@ -936,65 +906,6 @@ def test_trainer_can_run_gradient_accumulation(self): assert num_batches_trained_per_epoch == num_batches_expected - def test_batch_callback_is_called_at_every_batch(self): - class FakeBatchCallback(BatchCallback): - def __call__( - self, - trainer: "GradientDescentTrainer", - batch_inputs: List[List[TensorDict]], - batch_outputs: List[Dict[str, Any]], - batch_metrics: Dict[str, Any], - epoch: int, - batch_number: int, - is_training: bool, - is_master: bool, - ) -> None: - if not hasattr(trainer, "batch_callback_calls"): - trainer.batch_callback_calls = [] # type: ignore - trainer.batch_callback_calls.append((epoch, batch_number, is_training)) # type: ignore - - trainer = GradientDescentTrainer( - self.model, - self.optimizer, - self.data_loader, - num_epochs=2, - validation_data_loader=self.validation_data_loader, - 
batch_callbacks=[FakeBatchCallback()], - ) - trainer.train() - expected_calls = [ - (epoch, batch_number + 1, is_train) - for epoch in range(2) - for is_train in (True, False) - for batch_number in range(len(self.instances) // 2) - ] - assert trainer.batch_callback_calls == expected_calls - - def test_epoch_callback_is_called_at_every_epoch(self): - class FakeEpochCallback(EpochCallback): - def __call__( - self, - trainer: "GradientDescentTrainer", - metrics: Dict[str, Any], - epoch: int, - is_master: bool, - ) -> None: - if not hasattr(trainer, "epoch_callback_calls"): - trainer.epoch_callback_calls = [] # type: ignore - trainer.epoch_callback_calls.append(epoch) # type: ignore - - trainer = GradientDescentTrainer( - self.model, - self.optimizer, - self.data_loader, - num_epochs=4, - validation_data_loader=self.validation_data_loader, - epoch_callbacks=[FakeEpochCallback()], - ) - trainer.train() - expected_calls = [epoch for epoch in range(-1, 4)] - assert trainer.epoch_callback_calls == expected_calls - def test_track_epoch_callback(self): num_epochs = 4 trainer = GradientDescentTrainer( @@ -1003,38 +914,19 @@ def test_track_epoch_callback(self): self.data_loader, num_epochs=num_epochs, validation_data_loader=self.validation_data_loader, - epoch_callbacks=[TrackEpochCallback()], + callbacks=[TrackEpochCallback(serialization_dir=self.TEST_DIR)], ) trainer.train() assert trainer.model.epoch == num_epochs - def test_end_callback_is_called_at_end(self): - class FakeEndCallback(EpochCallback): - def __call__( - self, - trainer: "GradientDescentTrainer", - metrics: Dict[str, Any], - epoch: int, - is_master: bool, - ) -> None: - if not hasattr(trainer, "end_callback_calls"): - trainer.end_callback_calls = [] # type: ignore - trainer.end_callback_calls.append(epoch) # type: ignore - - trainer = GradientDescentTrainer( - self.model, - self.optimizer, - self.data_loader, - num_epochs=4, - validation_data_loader=self.validation_data_loader, - end_callbacks=[FakeEndCallback()], - ) - trainer.train() - expected_calls = [3] - assert trainer.end_callback_calls == expected_calls - def test_trainer_callback_is_called_everywhere(self): class FakeTrainerCallback(TrainerCallback): + def on_start( + self, trainer: "GradientDescentTrainer", is_primary: bool = True, **kwargs + ) -> None: + if not hasattr(trainer, "start_callback_is_fired_first"): + trainer.start_callback_is_fired_first = True # type: ignore + def on_batch( self, trainer: "GradientDescentTrainer", @@ -1044,8 +936,12 @@ def on_batch( epoch: int, batch_number: int, is_training: bool, - is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: + if not hasattr(trainer, "start_callback_is_fired_first"): + trainer.start_callback_is_fired_first = False # type: ignore + if not hasattr(trainer, "batch_callback_calls"): trainer.batch_callback_calls = [] # type: ignore trainer.batch_callback_calls.append((epoch, batch_number, is_training)) # type: ignore @@ -1055,8 +951,12 @@ def on_epoch( trainer: "GradientDescentTrainer", metrics: Dict[str, Any], epoch: int, - is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: + if not hasattr(trainer, "start_callback_is_fired_first"): + trainer.start_callback_is_fired_first = False # type: ignore + if not hasattr(trainer, "epoch_callback_calls"): trainer.epoch_callback_calls = [] # type: ignore trainer.epoch_callback_calls.append(epoch) # type: ignore @@ -1064,10 +964,14 @@ def on_epoch( def on_end( self, trainer: "GradientDescentTrainer", - metrics: Dict[str, Any], - epoch: int, - is_master: 
bool, + metrics: Dict[str, Any] = None, + epoch: int = None, + is_primary: bool = True, + **kwargs, ) -> None: + if not hasattr(trainer, "start_callback_is_fired_first"): + trainer.start_callback_is_fired_first = False # type: ignore + if not hasattr(trainer, "end_callback_calls"): trainer.end_callback_calls = [] # type: ignore trainer.end_callback_calls.append(epoch) # type: ignore @@ -1078,7 +982,7 @@ def on_end( self.data_loader, num_epochs=2, validation_data_loader=self.validation_data_loader, - trainer_callbacks=[FakeTrainerCallback()], + callbacks=[FakeTrainerCallback(serialization_dir=self.TEST_DIR)], ) trainer.train() expected_batch_calls = [ @@ -1087,26 +991,21 @@ def on_end( for is_train in (True, False) for batch_number in range(len(self.instances) // 2) ] - expected_epoch_calls = [epoch for epoch in range(-1, 2)] + expected_epoch_calls = [epoch for epoch in range(0, 2)] expected_end_calls = [1] + assert trainer.start_callback_is_fired_first assert trainer.batch_callback_calls == expected_batch_calls assert trainer.epoch_callback_calls == expected_epoch_calls assert trainer.end_callback_calls == expected_end_calls def test_total_loss_is_average_of_batch_loss(self): - batches_per_epoch = 3 - data_loader_custom_epoch_lazy = PyTorchDataLoader( - self.instances_lazy, - batch_size=2, - collate_fn=allennlp_collate, - batches_per_epoch=batches_per_epoch, - ) + self.data_loader_lazy.batches_per_epoch = 3 - class FakeBatchCallback(BatchCallback): - def __call__( + class FakeOnBatchCallback(TrainerCallback): + def on_batch( self, trainer: "GradientDescentTrainer", batch_inputs: List[List[TensorDict]], @@ -1115,7 +1014,8 @@ def __call__( epoch: int, batch_number: int, is_training: bool, - is_master: bool, + is_primary: bool = True, + **kwargs, ) -> None: if not hasattr(trainer, "batch_losses"): trainer.batch_losses = [] # type: ignore @@ -1124,9 +1024,9 @@ def __call__( trainer = GradientDescentTrainer( self.model, self.optimizer, - data_loader_custom_epoch_lazy, + self.data_loader_lazy, num_epochs=1, - batch_callbacks=[FakeBatchCallback()], + callbacks=[FakeOnBatchCallback(serialization_dir=self.TEST_DIR)], ) metrics = trainer.train() diff --git a/tests/training/util_test.py b/tests/training/util_test.py index 883847067b4..7bf00d2ee3f 100644 --- a/tests/training/util_test.py +++ b/tests/training/util_test.py @@ -44,6 +44,7 @@ class TestMakeVocabFromParams(AllenNlpTestCase): "validation_data_path": "path-to-validation-file", "test_data_path": "path-to-validation-file", "datasets_for_vocab_creation": [], + "data_loader": {"batch_size": 2}, } ), Params( @@ -51,6 +52,7 @@ class TestMakeVocabFromParams(AllenNlpTestCase): "dataset_reader": {"type": "train-util-test-reader"}, "train_data_path": "path-to-training-file", "datasets_for_vocab_creation": [], + "data_loader": {"batch_size": 2}, } ), Params( @@ -60,6 +62,7 @@ class TestMakeVocabFromParams(AllenNlpTestCase): "validation_data_path": "path-to-validation-file", "test_data_path": "path-to-validation-file", "vocabulary": {"type": "empty"}, + "data_loader": {"batch_size": 2}, } ), ], @@ -77,6 +80,7 @@ def test_only_train_read_for_vocab(self, caplog): { "dataset_reader": {"type": "train-util-test-reader"}, "train_data_path": "path-to-training-file", + "data_loader": {"batch_size": 2}, } ) _ = make_vocab_from_params(params, str(self.TEST_DIR)) @@ -95,6 +99,7 @@ def test_all_datasets_read_for_vocab(self, caplog): "train_data_path": "path-to-training-file", "validation_data_path": "path-to-validation-file", "test_data_path": "path-to-test-file", 
+ "data_loader": {"batch_size": 2}, } ) _ = make_vocab_from_params(params, str(self.TEST_DIR)) @@ -114,6 +119,7 @@ def test_only_specified_datasets_read_for_vocab(self, caplog): "validation_data_path": "path-to-validation-file", "test_data_path": "path-to-test-file", "datasets_for_vocab_creation": ["train", "validation"], + "data_loader": {"batch_size": 2}, } ) _ = make_vocab_from_params(params, str(self.TEST_DIR)) @@ -132,6 +138,7 @@ def test_using_seperate_validation_reader(self, caplog): "validation_dataset_reader": {"type": "train-util-test-reader"}, "train_data_path": "path-to-training-file", "validation_data_path": "path-to-validation-file", + "data_loader": {"batch_size": 2}, } ) _ = make_vocab_from_params(params, str(self.TEST_DIR)) @@ -145,6 +152,7 @@ def test_invalid_datasets_for_vocab_creation(self): "train_data_path": "path-to-training-file", "validation_data_path": "path-to-validation-file", "datasets_for_vocab_creation": ["train", "validation", "test"], + "data_loader": {"batch_size": 2}, } ) with pytest.raises(ConfigurationError, match="invalid 'datasets_for_vocab_creation' test"): @@ -156,6 +164,7 @@ def test_raise_error_if_directory_non_empty(self): "dataset_reader": {"type": "train-util-test-reader"}, "train_data_path": "path-to-training-file", "validation_data_path": "path-to-validation-file", + "data_loader": {"batch_size": 2}, } ) os.makedirs(self.TEST_DIR / "vocabulary")