In [1]:
import argparse
import random
import os
import torch
import time
import functools
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.utils.comm import get_rank
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.mlperf_logger import configure_logger
from maskrcnn_benchmark.utils.mlperf_logger import log_start
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.solver import make_optimizer
from maskrcnn_benchmark.solver import make_lr_scheduler
from maskrcnn_benchmark.engine.trainer import do_train
from scaleoutbridge import init_bridge, ScaleoutBridge as SBridge

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from mlperf_logging.mllog import constants

In [3]:
import json

In [4]:
from maskrcnn_benchmark.config import cfg


In [5]:
# Configure the MLPerf logger with the MASKRCNN benchmark constant, then work
# out how many processes take part in this run. WORLD_SIZE is absent for a
# plain single-process launch, in which case one GPU is assumed.
configure_logger(constants.MASKRCNN)
num_gpus = int(os.environ.get("WORLD_SIZE", 1))
distributed = num_gpus > 1

In [6]:
# Argument parser with three entries:
#   --config-file  path to a config file (string, empty by default)
#   --local_rank   integer rank, defaulting to the LOCAL_RANK env var (or 0)
#   opts           every remaining token, collected as config overrides
# The final add_argument call stays the last expression of the cell, so the
# notebook still displays the returned action object.
parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
parser.add_argument(
    "--config-file",
    type=str,
    default="",
    metavar="FILE",
    help="path to config file",
)
parser.add_argument(
    "--local_rank",
    default=os.getenv('LOCAL_RANK', 0),
    type=int,
)
parser.add_argument(
    "opts",
    nargs=argparse.REMAINDER,
    default=None,
    help="Modify config options using the command-line",
)

# args = parser.parse_args()

_StoreAction(option_strings=[], dest='opts', nargs='...', const=None, default=None, type=None, choices=None, help='Modify config options using the command-line', metavar=None)

In [7]:
# Merge the YAML config into the imported `cfg` object (judging by its file
# name, a one-node test variant of the R-50 FPN Mask R-CNN setup).
# NOTE(review): the path is relative, so this presumably has to be run from
# the directory that contains `configs/` — confirm.
# The command-line override and freeze steps below are intentionally disabled
# in this notebook, so `cfg` stays mutable.
cfg.merge_from_file("configs/e2e_mask_rcnn_R_50_FPN_1x_1_node_test.yaml")
# cfg.merge_from_list(args.opts)
# cfg.freeze()

In [8]:
# Initialise (optional) multi-process state and choose the master seed.
#
# Produces: num_gpus, args_distributed, world_size, rank, master_seed.
# With WORLD_SIZE > 1 the NCCL process group is created and rank 0's seed is
# broadcast to every rank; otherwise a local random seed is drawn.
num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
args_distributed = num_gpus > 1
if args_distributed:
    # The notebook never calls parser.parse_args(), so there is no `args`
    # object to read from. Take the local rank from LOCAL_RANK directly, which
    # is the same source the --local_rank option uses for its default.
    local_rank = int(os.getenv("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)
    torch.distributed.init_process_group(
        backend="nccl", init_method="env://"
    )
    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    # setting seeds - needs to be timed, so after RUN_START
    # Only rank 0 draws a seed; `rank == 0` is used because no
    # `is_main_process` helper is imported in this notebook.
    if rank == 0:
        master_seed = random.SystemRandom().randint(0, 2 ** 32 - 1)
    else:
        master_seed = 0
    # Broadcast as int64: float32 cannot represent every 32-bit integer, so a
    # seed near 2**32 - 1 would be rounded (possibly up to 2**32, which is
    # outside the intended range).
    seed_tensor = torch.tensor(master_seed, dtype=torch.int64, device=torch.device("cuda"))

    torch.distributed.broadcast(seed_tensor, 0)
    master_seed = int(seed_tensor.item())
else:
    world_size = 1
    rank = 0
    # random master seed, random.SystemRandom() uses /dev/urandom on Unix
    master_seed = random.SystemRandom().randint(0, 2 ** 32 - 1)

In [9]:
# Split the available ranks between training and evaluation.
# A negative DEDICATED_EVALUATION_RANKS setting is clamped to zero.
dedicated_evaluation_ranks = max(0, cfg.DEDICATED_EVALUATION_RANKS)
num_training_ranks = world_size - dedicated_evaluation_ranks
if dedicated_evaluation_ranks == 0:
    # No dedicated evaluators: every rank takes part in evaluation.
    num_evaluation_ranks = world_size
else:
    num_evaluation_ranks = dedicated_evaluation_ranks

# Per-GPU batch sizes: the global batch divided (floor) across the ranks.
images_per_gpu_train = cfg.SOLVER.IMS_PER_BATCH // num_training_ranks
images_per_gpu_test = cfg.TEST.IMS_PER_BATCH // num_evaluation_ranks

In [10]:
# Collect the run settings that are handed to the model as one dict.
# NOTE(review): `training_comm` is not defined anywhere in the visible
# notebook; it is only evaluated when dedicated_evaluation_ranks != 0, in
# which case this cell would raise NameError — confirm where it should come from.
arguments = {
    "iteration": 0,
    "nhwc": cfg.NHWC,
    'ims_per_batch': cfg.SOLVER.IMS_PER_BATCH,
    "distributed": distributed,
    "max_annotations_per_image": cfg.DATALOADER.MAX_ANNOTATIONS_PER_IMAGE,
    "dedicated_evaluation_ranks": dedicated_evaluation_ranks,
    "num_training_ranks": num_training_ranks,
    "training_comm": training_comm if dedicated_evaluation_ranks != 0 else None,
    "images_per_gpu_train": images_per_gpu_train,
    "use_synthetic_input": cfg.DATALOADER.USE_SYNTHETIC_INPUT,
}
# Synthetic input and the hybrid dataloader are mutually exclusive.
assert not (
    cfg.DATALOADER.USE_SYNTHETIC_INPUT and cfg.DATALOADER.HYBRID
), "USE_SYNTHETIC_INPUT and HYBRID can't both be used together"
arguments["enable_nsys_profiling"] = cfg.ENABLE_NSYS_PROFILING
output_dir = cfg.OUTPUT_DIR

# save_to_disk = get_rank() == 0
# checkpointer = DetectronCheckpointer(
#     cfg, model, optimizer, scheduler, output_dir, save_to_disk
# )
# arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

# extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, cfg.NHWC)
# arguments.update(extra_checkpoint_data)


In [11]:
import pytorch_lightning as pl
from rcnn_lightning import LightningGeneralizedRCNN

In [12]:
# Build the Lightning wrapper (imported from rcnn_lightning) from the merged
# config and the `arguments` dict assembled in the previous cells.
model = LightningGeneralizedRCNN(cfg, arguments)


:::MLLOG {"namespace": "", "time_ms": 1657926824575, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/fpn.py", "lineno": 56, "tensor": "FPN_inner_block1"}}
:::MLLOG {"namespace": "", "time_ms": 1657926824723, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/fpn.py", "lineno": 59, "tensor": "FPN_layer_block1"}}
:::MLLOG {"namespace": "", "time_ms": 1657926824726, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/fpn.py", "lineno": 56, "tensor": "FPN_inner_block2"}}
:::MLLOG {"namespace": "", "time_ms": 1657926824735, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/bac



In [13]:
# Move the model to the CUDA device. As the last expression of the cell, the
# returned module is also displayed, which is why the full module tree is
# printed below.
model.to('cuda')

LightningGeneralizedRCNN(
  (model): GeneralizedRCNN(
    (graphable): Graphable(
      (backbone): Sequential(
        (body): ResNet(
          (stem): StemWithFixedBatchNorm(
            (_base_stem): _BaseStem(
              (conv1): RecursiveScriptModule(original_name=Conv2d_NHWC)
              (bn1): FrozenBatchNorm2d_NHWC()
            )
            (max_pool): MaxPool2d_NHWC(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          )
          (layer1): Sequential(
            (0): FastBottleneckWithFixedBatchNorm(
              (downsample): Sequential(
                (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (1): FrozenBatchNorm2d()
              )
              (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
 

In [14]:
# Sanity check: show which device the model's first parameter lives on.
# next() reads only the first parameter instead of materialising a list of
# every parameter tensor just to index element 0.
next(model.parameters()).device

device(type='cuda', index=0)

In [15]:
# Create a Lightning trainer restricted to one GPU; every other Trainer option
# is left at its default. The commented line is the all-defaults alternative.
trainer=pl.Trainer(accelerator="gpu", devices=1)
# trainer=pl.Trainer()

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [16]:
# Start training with the trainer created above.
# NOTE(review): the recorded output of this cell is a MisconfigurationException
# about the return value of `model.configure_optimizers()`. That method lives
# in rcnn_lightning (not visible here) and has to return one of the listed
# forms before this cell can run to completion.
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


MisconfigurationException: Unknown configuration for model optimizers. Output from `model.configure_optimizers()` should be one of:
 * `Optimizer`
 * [`Optimizer`]
 * ([`Optimizer`], [`_LRScheduler`])
 * {"optimizer": `Optimizer`, (optional) "lr_scheduler": `_LRScheduler`}
 * A list of the previously described dict format, with an optional "frequency" key (int)

In [None]:
# Re-check the device of the model's first parameter (same probe as earlier
# in the notebook). next() reads only the first parameter instead of building
# a list of every parameter tensor.
next(model.parameters()).device