Commit
albert example (#1326)
Scitator committed Oct 11, 2021
1 parent bcb0bfc commit 84ad525
Showing 6 changed files with 248 additions and 60 deletions.
80 changes: 60 additions & 20 deletions examples/engines/README.md
@@ -3,34 +3,55 @@
Let's check different DataParallel and DistributedDataParallel multi-GPU setups with Catalyst Engines.

> Note: for the Albert example, please install the extra requirements with ``pip install datasets transformers``.
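
The `--engine` values used in the commands below map to Catalyst engine classes. A minimal sketch of that pattern (illustrative only, not part of this commit; the real mapping lives in `examples/engines/common.py` further down):

```python
# Illustrative sketch: how an --engine value selects a Catalyst engine.
from catalyst import dl

ENGINES = {
    "de": dl.DeviceEngine,                    # single device
    "dp": dl.DataParallelEngine,              # multi-GPU DataParallel
    "ddp": dl.DistributedDataParallelEngine,  # multi-GPU DistributedDataParallel
}

engine = ENGINES["ddp"]()  # the example runners return such an instance from get_engine()
```
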
## Core

### PyTorch
```bash
pip install catalyst

CUDA_VISIBLE_DEVICES="0" python train.py --engine=de
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=dp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=ddp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=ddp --sync-bn
# CV - ResNet
CUDA_VISIBLE_DEVICES="0" python train_resnet.py --engine=de
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=dp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp --sync-bn

# NLP - Albert
pip install datasets transformers
CUDA_VISIBLE_DEVICES="0" python train_albert.py --engine=de
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=dp
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp
```

### PyTorch AMP
```bash
pip install torch>=1.8.0 catalyst

CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=amp-dp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=amp-ddp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=amp-ddp --sync-bn
# CV - ResNet
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-dp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-ddp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-ddp --sync-bn

# NLP - Albert
pip install datasets transformers
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=amp-dp
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=amp-ddp
```

### PyTorch XLA
```bash
pip install catalyst
pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

python train.py --engine=xla
python train.py --engine=xla-ddp
# CV - ResNet
python train_resnet.py --engine=xla
python train_resnet.py --engine=xla-ddp

# NLP - Albert
pip install datasets transformers
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=xla
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=xla-ddp
```

## Extensions
@@ -40,9 +61,15 @@ python train.py --engine=xla-ddp
pip install catalyst && install-apex
# or git clone https://github.com/NVIDIA/apex && cd apex && pip install -e .

CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=apex-dp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=apex-ddp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=apex-ddp --sync-bn
# CV - ResNet
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=apex-dp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=apex-ddp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=apex-ddp --sync-bn

# NLP - Albert
pip install datasets transformers
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=apex-dp
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=apex-ddp
```

### DeepSpeed
@@ -52,19 +79,32 @@ CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=apex-ddp --sync-bn
# docker run --rm -it -v $(pwd):/workspace deepspeed/deepspeed:v031_torch17_cuda11 /bin/bash
pip install catalyst[deepspeed]

CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=ds-ddp
# CV - ResNet
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ds-ddp

# NLP - Albert
pip install datasets transformers
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ds-ddp
```

### FairScale
> *Tested under `pip install -U torch==1.8.1 fairscale==0.3.7 catalyst==21.06`*
```bash
pip install torch>=1.8.0 catalyst[fairscale]

CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=fs-pp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=fs-ddp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=fs-ddp --sync-bn
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=fs-ddp-amp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=fs-ddp-amp --sync-bn
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=fs-fddp
CUDA_VISIBLE_DEVICES="0,1" python train.py --engine=fs-fddp --sync-bn
# CV - ResNet
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-pp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp --sync-bn
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp-amp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp-amp --sync-bn
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-fddp
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-fddp --sync-bn

# NLP - Albert
pip install datasets transformers
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-pp
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp-amp
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-fddp
```
38 changes: 38 additions & 0 deletions examples/engines/common.py
@@ -0,0 +1,38 @@
from functools import partial

from catalyst import dl, SETTINGS

E2E = {
    "de": dl.DeviceEngine,
    "dp": dl.DataParallelEngine,
    "ddp": dl.DistributedDataParallelEngine,
}

if SETTINGS.amp_required:
    E2E.update(
        {"amp-dp": dl.DataParallelAMPEngine, "amp-ddp": dl.DistributedDataParallelAMPEngine}
    )

if SETTINGS.apex_required:
    E2E.update(
        {"apex-dp": dl.DataParallelAPEXEngine, "apex-ddp": dl.DistributedDataParallelAPEXEngine}
    )

if SETTINGS.deepspeed_required:
    E2E.update({"ds-ddp": dl.DistributedDataParallelDeepSpeedEngine})

if SETTINGS.fairscale_required:
    E2E.update(
        {
            "fs-pp": dl.PipelineParallelFairScaleEngine,
            "fs-ddp": dl.SharedDataParallelFairScaleEngine,
            "fs-ddp-amp": dl.SharedDataParallelFairScaleAMPEngine,
            # for some reason we could catch a bug with FairScale flatten wrapper here, so...
            "fs-fddp": partial(
                dl.FullySharedDataParallelFairScaleEngine, ddp_kwargs={"flatten_parameters": False}
            ),
        }
    )

if SETTINGS.xla_required:
    E2E.update({"xla": dl.XLAEngine, "xla-ddp": dl.DistributedXLAEngine})
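
Since `E2E` only registers backends whose packages are installed (guarded by the `SETTINGS.*_required` flags above), a quick way to see which engines are usable in a given environment is to print the registry. A hypothetical helper, not part of this commit:

```python
# Hypothetical helper: list the engines registered for this environment.
from common import E2E

if __name__ == "__main__":
    for name, engine in E2E.items():
        # partial entries (e.g. "fs-fddp") have no __name__, so fall back to their repr
        print(f"{name:12s} -> {getattr(engine, '__name__', engine)}")
```
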
142 changes: 142 additions & 0 deletions examples/engines/train_albert.py
@@ -0,0 +1,142 @@
#!/usr/bin/env python
# flake8: noqa
from argparse import ArgumentParser, RawTextHelpFormatter

from common import E2E

from datasets import load_dataset
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler

from catalyst import dl


class CustomRunner(dl.IRunner):
    def __init__(self, logdir: str, engine: str):
        super().__init__()
        self._logdir = logdir
        self._engine = engine

    def get_engine(self):
        return E2E[self._engine]()

    def get_loggers(self):
        return {
            "console": dl.ConsoleLogger(),
            "csv": dl.CSVLogger(logdir=self._logdir),
            "tensorboard": dl.TensorboardLogger(logdir=self._logdir),
        }

    @property
    def stages(self):
        return ["train"]

    def get_stage_len(self, stage: str) -> int:
        return 10

    def get_loaders(self, stage: str):
        datasets = load_dataset("glue", "sst2")
        tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
        encoded_datasets = datasets.map(
            lambda examples: tokenizer(
                examples["sentence"],
                max_length=128,
                truncation=True,
                padding="max_length",
            ),
            batched=True,
        )
        encoded_datasets = encoded_datasets.map(lambda x: {"labels": x["label"]})
        encoded_datasets.set_format(
            type="torch", columns=["input_ids", "attention_mask", "labels"]
        )

        train_data = encoded_datasets["train"]
        valid_data = encoded_datasets["validation"]

        if self.engine.is_ddp:
            train_sampler = DistributedSampler(
                train_data,
                num_replicas=self.engine.world_size,
                rank=self.engine.rank,
                shuffle=True,
            )
            valid_sampler = DistributedSampler(
                valid_data,
                num_replicas=self.engine.world_size,
                rank=self.engine.rank,
                shuffle=False,
            )
        else:
            train_sampler = valid_sampler = None

        self.train_loader_len = len(DataLoader(train_data, batch_size=64, sampler=train_sampler))

        return {
            "train": DataLoader(train_data, batch_size=64, sampler=train_sampler),
            "valid": DataLoader(valid_data, batch_size=32, sampler=valid_sampler),
        }

    def get_model(self, stage: str):
        model = (
            self.model
            if self.model is not None
            else AutoModelForSequenceClassification.from_pretrained("albert-base-v2")
        )
        return model

    def get_criterion(self, stage: str):
        return nn.CrossEntropyLoss()

    def get_optimizer(self, stage: str, model):
        return optim.Adam(model.parameters(), lr=3e-5)

    def get_scheduler(self, stage: str, optimizer):
        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=int(0.05 * self.train_loader_len) * self.stage_epoch_len,
            num_training_steps=self.train_loader_len * self.stage_epoch_len,
        )
        return scheduler

    def get_callbacks(self, stage: str):
        return {
            "criterion": dl.CriterionCallback(
                input_key="logits", target_key="labels", metric_key="loss"
            ),
            "optimizer": dl.OptimizerCallback(metric_key="loss"),
            "scheduler": dl.SchedulerCallback(loader_key="valid", metric_key="loss", mode="batch"),
            "accuracy": dl.AccuracyCallback(
                input_key="logits", target_key="labels", topk_args=(1,)
            ),
            "checkpoint": dl.CheckpointCallback(
                self._logdir,
                loader_key="valid",
                metric_key="accuracy",
                minimize=False,
                save_n_best=1,
            ),
            # "tqdm": dl.TqdmCallback(),
        }

    def handle_batch(self, batch):
        outputs = self.model(**batch)

        self.batch = {
            "features": batch["input_ids"],
            "labels": batch["labels"],
            "logits": outputs.logits,
        }


if __name__ == "__main__":
    parser = ArgumentParser(formatter_class=RawTextHelpFormatter)
    parser.add_argument("--logdir", type=str, default=None)
    parser.add_argument("--engine", type=str, choices=list(E2E.keys()))
    args, _ = parser.parse_known_args()
    args.logdir = args.logdir or f"logs_albert_{args.engine}".replace("-", "_")
    runner = CustomRunner(args.logdir, args.engine)
    runner.run()
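
For a sense of scale, the linear schedule in `get_scheduler` above warms up over roughly 5% of one epoch's batches, scaled by the number of epochs. A back-of-the-envelope check, assuming the GLUE SST-2 train split of about 67,349 sentences (an assumption, not stated in the commit):

```python
# Rough step counts for the linear schedule above (assumptions: ~67,349
# SST-2 training sentences, batch size 64, 10 epochs as in get_stage_len).
import math

train_loader_len = math.ceil(67_349 / 64)                            # ~1053 batches per epoch
stage_epoch_len = 10
num_warmup_steps = int(0.05 * train_loader_len) * stage_epoch_len    # 520
num_training_steps = train_loader_len * stage_epoch_len              # 10530
print(num_warmup_steps, num_training_steps)
```
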
45 changes: 7 additions & 38 deletions examples/engines/train.py → examples/engines/train_resnet.py
@@ -1,53 +1,19 @@
#!/usr/bin/env python
# flake8: noqa
from argparse import ArgumentParser, RawTextHelpFormatter
from functools import partial
import os

from common import E2E

from torch import nn, optim
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

from catalyst import dl, SETTINGS, utils
from catalyst import dl, utils
from catalyst.contrib.datasets import CIFAR10
from catalyst.contrib.nn import ResidualBlock
from catalyst.data import transforms

E2E = {
    "de": dl.DeviceEngine,
    "dp": dl.DataParallelEngine,
    "ddp": dl.DistributedDataParallelEngine,
}

if SETTINGS.amp_required:
    E2E.update(
        {"amp-dp": dl.DataParallelAMPEngine, "amp-ddp": dl.DistributedDataParallelAMPEngine}
    )

if SETTINGS.apex_required:
    E2E.update(
        {"apex-dp": dl.DataParallelAPEXEngine, "apex-ddp": dl.DistributedDataParallelAPEXEngine}
    )

if SETTINGS.deepspeed_required:
    E2E.update({"ds-ddp": dl.DistributedDataParallelDeepSpeedEngine})

if SETTINGS.fairscale_required:
    E2E.update(
        {
            "fs-pp": dl.PipelineParallelFairScaleEngine,
            "fs-ddp": dl.SharedDataParallelFairScaleEngine,
            "fs-ddp-amp": dl.SharedDataParallelFairScaleAMPEngine,
            # for some reason we could catch a bug with FairScale flatten wrapper here, so...
            "fs-fddp": partial(
                dl.FullySharedDataParallelFairScaleEngine, ddp_kwargs={"flatten_parameters": False}
            ),
        }
    )

if SETTINGS.xla_required:
    E2E.update({"xla": dl.XLAEngine, "xla-ddp": dl.DistributedXLAEngine})


def conv_block(in_channels, out_channels, pool=False):
    layers = [
@@ -154,6 +120,7 @@ def get_callbacks(self, stage: str):
                minimize=False,
                save_n_best=1,
            ),
            # "tqdm": dl.TqdmCallback(),
        }

    def handle_batch(self, batch):
@@ -172,6 +139,8 @@ def handle_batch(self, batch):
    parser.add_argument("--engine", type=str, choices=list(E2E.keys()))
    utils.boolean_flag(parser, "sync-bn", default=False)
    args, _ = parser.parse_known_args()
    args.logdir = args.logdir or f"logs_{args.engine}_sbn{int(args.sync_bn)}".replace("-", "_")
    args.logdir = args.logdir or f"logs_resnet_{args.engine}_sbn{int(args.sync_bn)}".replace(
        "-", "_"
    )
    runner = CustomRunner(args.logdir, args.engine, args.sync_bn)
    runner.run()
1 change: 0 additions & 1 deletion examples/self_supervised/common.py
@@ -1,7 +1,6 @@
from typing import Dict, Optional

from datasets import datasets

import torch
from torch.utils.data import DataLoader

2 changes: 1 addition & 1 deletion setup.cfg
@@ -36,4 +36,4 @@ use_parentheses = true
# - dl libs (known_dl)
# - catalyst imports
sections = FUTURE,STDLIB,THIRDPARTY,DL,FIRSTPARTY,LOCALFOLDER
known_dl = albumentations,gym,gym_minigrid,kornia,mlflow,neptune,tensorboard,tensorboardX,tensorflow,torch,torchvision,transformers,wandb
known_dl = albumentations,datasets,gym,gym_minigrid,kornia,mlflow,neptune,tensorboard,tensorboardX,tensorflow,torch,torchvision,transformers,wandb
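
With `datasets` added to `known_dl`, isort keeps the Hugging Face `datasets` import grouped with the other deep-learning libraries instead of the generic third-party section, matching the import block of `train_albert.py` above. An illustrative (assumed) grouping:

```python
# Illustrative isort result with `datasets` in the DL section:
from argparse import ArgumentParser        # STDLIB

from datasets import load_dataset          # DL group, alphabetical within it
from torch import nn
from transformers import AutoTokenizer

from catalyst import dl                    # FIRSTPARTY
```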
