In [1]:
import argparse
import random
import os
import torch
import time
import functools
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.utils.comm import get_rank
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.mlperf_logger import configure_logger
from maskrcnn_benchmark.utils.mlperf_logger import log_start
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.solver import make_optimizer
from maskrcnn_benchmark.solver import make_lr_scheduler
from maskrcnn_benchmark.engine.trainer import do_train
from scaleoutbridge import init_bridge, ScaleoutBridge as SBridge

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from mlperf_logging.mllog import constants

In [3]:
import json

In [4]:
configure_logger(constants.MASKRCNN)
num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
distributed = num_gpus > 1

In [5]:
parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
parser.add_argument("--local_rank", type=int, default=os.getenv('LOCAL_RANK', 0))
parser.add_argument(
    "opts",
    help="Modify config options using the command-line",
    default=None,
    nargs=argparse.REMAINDER,
)

# args = parser.parse_args()

_StoreAction(option_strings=[], dest='opts', nargs='...', const=None, default=None, type=None, choices=None, help='Modify config options using the command-line', metavar=None)

In [6]:
def get_training_world():

    """
    Calculates number of devices in Sagemaker distributed cluster
    """
    
    # Get params of Sagemaker distributed cluster from predefined env variables
    num_gpus = int(os.environ["SM_NUM_GPUS"])
    num_cpus = int(os.environ["SM_NUM_CPUS"])
    hosts = json.loads(os.environ["SM_HOSTS"])
    current_host = os.environ["SM_CURRENT_HOST"]

    # Define PyTorch training world
    world = {}
    world["number_of_processes"] = num_gpus if num_gpus > 0 else num_cpus
    world["number_of_machines"] = len(hosts)
    world["size"] = world["number_of_processes"] * world["number_of_machines"]
    world["machine_rank"] = hosts.index(current_host)
    world["master_addr"] = hosts[0]
    world["master_port"] = "55555" # port is defined by Sagemaker

    return world

In [7]:
os.environ["SM_HPS"]='{"batch-size": "256", "learning-rate": "0.0001","communicator": "pure_nccl"}'
os.environ["SM_NUM_GPUS"]='1'
os.environ["SM_NUM_CPUS"]='32'
os.environ["SM_HOSTS"]='["algo-1","algo-2"]'
os.environ["SM_CURRENT_HOST"]="b51d59bfb99b"

In [18]:
data_dir="/workspace/data/all_data/"
train_script="/workspace/amazon-sagemaker-cv/src/aws_train_mlperf.py"
# unarchive_data(data_dir)
# world = get_training_world()
sm_args = json.loads(os.environ["SM_HPS"])
args = [f"--{key} {value}" for key, value in sm_args.items()]

In [None]:
args = parser.parse_args()

In [20]:
cfg.merge_from_file("configs/e2e_mask_rcnn_R_50_FPN_1x_1_node_test.yaml")
# cfg.merge_from_list(args.opts)
# cfg.freeze()

In [13]:
if cfg.DATALOADER.ALWAYS_PAD_TO_MAX or cfg.USE_CUDA_GRAPH:
    min_size = cfg.INPUT.MIN_SIZE_TRAIN[0] if isinstance(cfg.INPUT.MIN_SIZE_TRAIN, tuple) else cfg.INPUT.MIN_SIZE_TRAIN
    max_size = cfg.INPUT.MAX_SIZE_TRAIN[0] if isinstance(cfg.INPUT.MAX_SIZE_TRAIN, tuple) else cfg.INPUT.MAX_SIZE_TRAIN
    divisibility = max(1, cfg.DATALOADER.SIZE_DIVISIBILITY)
    shapes_per_orientation = cfg.CUDA_GRAPH_NUM_SHAPES_PER_ORIENTATION

    min_size = ((min_size + divisibility - 1) // divisibility) * divisibility
    max_size = ((max_size + divisibility - 1) // divisibility) * divisibility
    size_range = (max_size - min_size) // divisibility

    shapes = []
    for i in range(0,shapes_per_orientation):
        size = min_size + ((i+1) * size_range // shapes_per_orientation) * divisibility
        shapes.append( (min_size, size) )
        shapes.append( (size, min_size) )
    print(shapes)
else:
    shapes = None

[(800, 1344), (1344, 800)]


In [14]:
master_seed = random.SystemRandom().randint(0, 2 ** 32 - 1)
random_number_generator = random.Random(master_seed)
data_loader, iters_per_epoch = make_data_loader(
            cfg,
            is_train=True,
            is_distributed=distributed,
            start_iter=0,
            random_number_generator=random_number_generator,
            seed=master_seed,
            shapes=shapes,
            hybrid_dataloader=None,
        )

When using more than one image per GPU you may encounter an out-of-memory (OOM) error if your GPU does not have sufficient memory. If this happens, you can reduce SOLVER.IMS_PER_BATCH (for training) or TEST.IMS_PER_BATCH (for inference). For training, you must also adjust the learning rate and schedule length according to the linear scaling rule. See for example: https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14


loading annotations into memory...
Done (t=13.65s)
creating index...
index created!


In [15]:
def mlperf_log_epoch_start(iteration, iters_per_epoch):
    # First iteration:
    #     Note we've started training & tag first epoch start
    if iteration == 0:
        log_start(key=constants.BLOCK_START, metadata={"first_epoch_num":1, "epoch_count":1})
        log_start(key=constants.EPOCH_START, metadata={"epoch_num":1})
        return
    if iteration % iters_per_epoch == 0:
        epoch = iteration // iters_per_epoch + 1
        log_start(key=constants.BLOCK_START, metadata={"first_epoch_num": epoch, "epoch_count": 1})
        log_start(key=constants.EPOCH_START, metadata={"epoch_num": epoch})

In [16]:
model = build_detection_model(cfg)
device = torch.device(cfg.MODEL.DEVICE)
model.to(device)

Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
:::MLLOG {"namespace": "", "time_ms": 1657664530929, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/fpn.py", "lineno": 56, "tensor": "FPN_inner_block1"}}
:::MLLOG {"namespace": "", "time_ms": 1657664531069, "event_type": "POINT_IN_TI

GeneralizedRCNN(
  (graphable): Graphable(
    (backbone): Sequential(
      (body): ResNet(
        (stem): StemWithFixedBatchNorm(
          (_base_stem): _BaseStem(
            (conv1): RecursiveScriptModule(original_name=Conv2d)
            (bn1): FrozenBatchNorm2d()
          )
          (max_pool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        )
        (layer1): Sequential(
          (0): FastBottleneckWithFixedBatchNorm(
            (downsample): Sequential(
              (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (1): FrozenBatchNorm2d()
            )
            (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (relu): ReLU(inplace=True)
            (bn1): FrozenBatchNorm2d()
            (

In [17]:
world_size = 1
dedicated_evaluation_ranks = max(0,cfg.DEDICATED_EVALUATION_RANKS)
num_training_ranks = world_size - dedicated_evaluation_ranks
images_per_gpu_train = cfg.SOLVER.IMS_PER_BATCH

In [18]:
images_per_gpu_train

96

In [19]:
model.train()
optimizer = make_optimizer(cfg, model)
scheduler = make_lr_scheduler(cfg, optimizer)
checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD

In [20]:
arguments = {}
arguments["iteration"] = 0
arguments["nhwc"] = cfg.NHWC
arguments['ims_per_batch'] = cfg.SOLVER.IMS_PER_BATCH
arguments["distributed"] = distributed
arguments["max_annotations_per_image"] = cfg.DATALOADER.MAX_ANNOTATIONS_PER_IMAGE
arguments["dedicated_evaluation_ranks"] = dedicated_evaluation_ranks
arguments["num_training_ranks"] = num_training_ranks
arguments["training_comm"] = None if dedicated_evaluation_ranks == 0 else training_comm
arguments["images_per_gpu_train"] = images_per_gpu_train
arguments["use_synthetic_input"] = cfg.DATALOADER.USE_SYNTHETIC_INPUT
assert not (cfg.DATALOADER.USE_SYNTHETIC_INPUT and cfg.DATALOADER.HYBRID), "USE_SYNTHETIC_INPUT and HYBRID can't both be used together"
arguments["enable_nsys_profiling"] = cfg.ENABLE_NSYS_PROFILING
output_dir = cfg.OUTPUT_DIR

save_to_disk = get_rank() == 0
checkpointer = DetectronCheckpointer(
    cfg, model, optimizer, scheduler, output_dir, save_to_disk
)
arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, cfg.NHWC)
arguments.update(extra_checkpoint_data)



In [21]:
per_iter_callback_fn = None
final_callback_fn=None
rank=0

In [22]:
success = do_train(
            model,
            data_loader,
            optimizer,
            scheduler,
            checkpointer,
            device,
            checkpoint_period,
            arguments,
            cfg.DISABLE_REDUCED_LOGGING,
            cfg.DISABLE_LOSS_LOGGING,
            per_iter_start_callback_fn=functools.partial(mlperf_log_epoch_start, iters_per_epoch=iters_per_epoch),
            per_iter_end_callback_fn=per_iter_callback_fn,
            final_callback_fn=final_callback_fn,
            rank=rank
        )

:::MLLOG {"namespace": "", "time_ms": 1657664537994, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/tmp/ipykernel_18863/677983664.py", "lineno": 5, "first_epoch_num": 1, "epoch_count": 1}}
:::MLLOG {"namespace": "", "time_ms": 1657664538002, "event_type": "INTERVAL_START", "key": "epoch_start", "value": null, "metadata": {"file": "/tmp/ipykernel_18863/677983664.py", "lineno": 6, "epoch_num": 1}}


RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
  File "/workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/resnet.py", line 361, in forward
    @torch.jit.script_method
    def forward(self, x):
        x = self.conv1(x)
            ~~~~~~~~~~ <--- HERE
        x = self.bn1(x)
        x = F.relu(x)
RuntimeError: AttributeError: 'RecursiveScriptModule' object has no attribute '_conv_forward'

At:
  /opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py(1186): __getattr__
  /opt/conda/lib/python3.8/site-packages/torch/jit/_script.py(481): __getattr__
  /opt/conda/lib/python3.8/site-packages/torch/jit/_script.py(764): __getattr__
  /opt/conda/lib/python3.8/site-packages/torch/nn/modules/conv.py(447): forward
  /workspace/maskrcnn/maskrcnn_benchmark/layers/misc.py(34): forward
  /opt/conda/lib/python3.8/site-packages/torch/jit/_recursive.py(899): lazy_binding_method
  /opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py(1111): _call_impl
  /workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/resnet.py(380): forward
  /opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py(1111): _call_impl
  /workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/resnet.py(161): forward
  /opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py(1111): _call_impl
  /opt/conda/lib/python3.8/site-packages/torch/nn/modules/container.py(139): forward
  /opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py(1111): _call_impl
  /workspace/maskrcnn/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py(30): forward
  /opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py(1111): _call_impl
  /workspace/maskrcnn/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py(393): forward
  /opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py(1111): _call_impl
  /workspace/maskrcnn/maskrcnn_benchmark/engine/trainer.py(382): do_train
  /tmp/ipykernel_18863/741175699.py(1): <cell line: 1>
  /opt/conda/lib/python3.8/site-packages/IPython/core/interactiveshell.py(3398): run_code
  /opt/conda/lib/python3.8/site-packages/IPython/core/interactiveshell.py(3338): run_ast_nodes
  /opt/conda/lib/python3.8/site-packages/IPython/core/interactiveshell.py(3135): run_cell_async
  /opt/conda/lib/python3.8/site-packages/IPython/core/async_helpers.py(129): _pseudo_sync_runner
  /opt/conda/lib/python3.8/site-packages/IPython/core/interactiveshell.py(2936): _run_cell
  /opt/conda/lib/python3.8/site-packages/IPython/core/interactiveshell.py(2881): run_cell
  /opt/conda/lib/python3.8/site-packages/ipykernel/zmqshell.py(528): run_cell
  /opt/conda/lib/python3.8/site-packages/ipykernel/ipkernel.py(383): do_execute
  /opt/conda/lib/python3.8/site-packages/ipykernel/kernelbase.py(724): execute_request
  /opt/conda/lib/python3.8/site-packages/ipykernel/kernelbase.py(400): dispatch_shell
  /opt/conda/lib/python3.8/site-packages/ipykernel/kernelbase.py(493): process_one
  /opt/conda/lib/python3.8/site-packages/ipykernel/kernelbase.py(504): dispatch_queue
  /opt/conda/lib/python3.8/asyncio/events.py(81): _run
  /opt/conda/lib/python3.8/asyncio/base_events.py(1859): _run_once
  /opt/conda/lib/python3.8/asyncio/base_events.py(570): run_forever
  /opt/conda/lib/python3.8/site-packages/tornado/platform/asyncio.py(199): start
  /opt/conda/lib/python3.8/site-packages/ipykernel/kernelapp.py(712): start
  /opt/conda/lib/python3.8/site-packages/traitlets/config/application.py(846): launch_instance
  /opt/conda/lib/python3.8/site-packages/ipykernel_launcher.py(17): <module>
  /opt/conda/lib/python3.8/runpy.py(87): _run_code
  /opt/conda/lib/python3.8/runpy.py(194): _run_module_as_main



In [48]:
start_iter = arguments["iteration"]
prefetcher = Prefetcher(data_loader, device, arguments["max_annotations_per_image"])
overflow_buf = None

In [6]:
import argparse
import os
import json

In [8]:
os.environ["SM_HPS"] = '{"batch-size": "256", "learning-rate": "0.0001","communicator": "pure_nccl"}'

In [9]:
sm_args = json.loads(os.environ["SM_HPS"])
args = [f"--{key} {value}" for key, value in sm_args.items()]

In [10]:
parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
parser.add_argument(
    "--config-file",
    default="",
    metavar="FILE",
    help="path to config file",
    type=str,
)
parser.add_argument("--local_rank", type=int, default=os.getenv('LOCAL_RANK', 0))
parser.add_argument(
    "opts",
    help="Modify config options using the command-line",
    default=None,
    nargs=argparse.REMAINDER,
)


args = parser.parse_args()

usage: ipykernel_launcher.py [-h] [--config-file FILE]
                             [--local_rank LOCAL_RANK]
                             ...
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

In [9]:
import pytorch_lightning as pl
from rcnn_lightning import LightningGeneralizedRCNN

In [22]:
model = LightningGeneralizedRCNN(cfg)
trainer=pl.Trainer()

Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
Error: Apex bottleneck only support nhwc
:::MLLOG {"namespace": "", "time_ms": 1657820038298, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/fpn.py", "lineno": 56, "tensor": "FPN_inner_block1"}}
:::MLLOG {"namespace": "", "time_ms": 1657820038416, "event_type": "POINT_IN_TI

AttributeError: cannot assign module before Module.__init__() call

In [3]:
trainer.fit(model)

NameError: name 'LightningGeneralizedRCNN' is not defined