In [1]:
import argparse
import random
import os
import torch
import time
import functools
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.data import make_data_loader
from maskrcnn_benchmark.utils.comm import get_rank
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.mlperf_logger import configure_logger
from maskrcnn_benchmark.utils.mlperf_logger import log_start
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.solver import make_optimizer
from maskrcnn_benchmark.solver import make_lr_scheduler
from maskrcnn_benchmark.engine.trainer import do_train
from scaleoutbridge import init_bridge, ScaleoutBridge as SBridge

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from mlperf_logging.mllog import constants

In [3]:
import json

In [4]:
from maskrcnn_benchmark.config import cfg

In [5]:
configure_logger(constants.MASKRCNN)
num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
distributed = num_gpus > 1

In [6]:
parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
parser.add_argument("--local_rank", type=int, default=os.getenv('LOCAL_RANK', 0))
parser.add_argument(
    "opts",
    help="Modify config options using the command-line",
    default=None,
    nargs=argparse.REMAINDER,
)

# args = parser.parse_args()

_StoreAction(option_strings=[], dest='opts', nargs='...', const=None, default=None, type=None, choices=None, help='Modify config options using the command-line', metavar=None)

In [7]:
cfg.merge_from_file("configs/e2e_mask_rcnn_R_50_FPN_1x_1_node_test.yaml")
# cfg.merge_from_list(args.opts)
# cfg.freeze()

In [8]:
num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
args_distributed = num_gpus >1
if args_distributed:
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(
        backend="nccl", init_method="env://"
    )
    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    # setting seeds - needs to be timed, so after RUN_START
    if is_main_process():
        master_seed = random.SystemRandom().randint(0, 2 ** 32 - 1)
        seed_tensor = torch.tensor(master_seed, dtype=torch.float32, device=torch.device("cuda"))
    else:
        seed_tensor = torch.tensor(0, dtype=torch.float32, device=torch.device("cuda"))

    torch.distributed.broadcast(seed_tensor, 0)
    master_seed = int(seed_tensor.item())
else:
    world_size = 1
    rank = 0
    # random master seed, random.SystemRandom() uses /dev/urandom on Unix
    master_seed = random.SystemRandom().randint(0, 2 ** 32 - 1)

In [9]:
dedicated_evaluation_ranks = max(0,cfg.DEDICATED_EVALUATION_RANKS)
num_training_ranks = world_size - dedicated_evaluation_ranks
num_evaluation_ranks = world_size if dedicated_evaluation_ranks == 0 else dedicated_evaluation_ranks

images_per_gpu_train = cfg.SOLVER.IMS_PER_BATCH // num_training_ranks
images_per_gpu_test = cfg.TEST.IMS_PER_BATCH // num_evaluation_ranks

In [10]:
arguments = {}
arguments["iteration"] = 0
arguments["nhwc"] = cfg.NHWC
arguments['ims_per_batch'] = cfg.SOLVER.IMS_PER_BATCH
arguments["distributed"] = distributed
arguments["max_annotations_per_image"] = cfg.DATALOADER.MAX_ANNOTATIONS_PER_IMAGE
arguments["dedicated_evaluation_ranks"] = dedicated_evaluation_ranks
arguments["num_training_ranks"] = num_training_ranks
arguments["training_comm"] = None if dedicated_evaluation_ranks == 0 else training_comm
arguments["images_per_gpu_train"] = images_per_gpu_train
arguments["use_synthetic_input"] = cfg.DATALOADER.USE_SYNTHETIC_INPUT
assert not (cfg.DATALOADER.USE_SYNTHETIC_INPUT and cfg.DATALOADER.HYBRID), "USE_SYNTHETIC_INPUT and HYBRID can't both be used together"
arguments["enable_nsys_profiling"] = cfg.ENABLE_NSYS_PROFILING
output_dir = cfg.OUTPUT_DIR

# save_to_disk = get_rank() == 0
# checkpointer = DetectronCheckpointer(
#     cfg, model, optimizer, scheduler, output_dir, save_to_disk
# )
# arguments["save_checkpoints"] = cfg.SAVE_CHECKPOINTS

# extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT, cfg.NHWC)
# arguments.update(extra_checkpoint_data)


In [11]:
import pytorch_lightning as pl
from rcnn_lightning import LightningGeneralizedRCNN

In [12]:
model = LightningGeneralizedRCNN(cfg, arguments)

:::MLLOG {"namespace": "", "time_ms": 1658608097957, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/fpn.py", "lineno": 56, "tensor": "FPN_inner_block1"}}
:::MLLOG {"namespace": "", "time_ms": 1658608098151, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/fpn.py", "lineno": 59, "tensor": "FPN_layer_block1"}}
:::MLLOG {"namespace": "", "time_ms": 1658608098155, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/backbone/fpn.py", "lineno": 56, "tensor": "FPN_inner_block2"}}
:::MLLOG {"namespace": "", "time_ms": 1658608098168, "event_type": "POINT_IN_TIME", "key": "weights_initialization", "value": null, "metadata": {"file": "/workspace/maskrcnn/maskrcnn_benchmark/modeling/bac



In [13]:
model.to('cuda')

LightningGeneralizedRCNN(
  (model): GeneralizedRCNN(
    (graphable): Graphable(
      (backbone): Sequential(
        (body): ResNet(
          (stem): StemWithFixedBatchNorm(
            (_base_stem): _BaseStem(
              (conv1): RecursiveScriptModule(original_name=Conv2d_NHWC)
              (bn1): FrozenBatchNorm2d_NHWC()
            )
            (max_pool): MaxPool2d_NHWC(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          )
          (layer1): Sequential(
            (0): FastBottleneckWithFixedBatchNorm(
              (downsample): Sequential(
                (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (1): FrozenBatchNorm2d()
              )
              (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
 

In [14]:
list(model.parameters())[0].device

device(type='cuda', index=0)

In [15]:
def custom_configure_optimizers(optim_conf):
    optimizers, lr_schedulers, optimizer_frequencies = [], [], []
    monitor = None
    optimizers = [optim_conf]
    return optimizers, lr_schedulers, optimizer_frequencies, monitor

In [16]:
def custom_init_optimizers_and_lr_schedulers(
    model: "pl.LightningModule",
):
    """Calls `LightningModule.configure_optimizers` and parses and validates the output."""
    assert model.trainer is not None
    optim_conf = model.trainer._call_lightning_module_hook("configure_optimizers", pl_module=model)

    if optim_conf is None:
        rank_zero_warn(
            "`LightningModule.configure_optimizers` returned `None`, this fit will run with no optimizer",
        )
        optim_conf = _MockOptimizer()

    optimizers, lr_schedulers, optimizer_frequencies, monitor = custom_configure_optimizers(optim_conf)
    lr_scheduler_configs = (
        _configure_schedulers_automatic_opt(lr_schedulers, monitor)
        if model.automatic_optimization
        else _configure_schedulers_manual_opt(lr_schedulers)
    )
    _set_scheduler_opt_idx(optimizers, lr_scheduler_configs)
    _validate_scheduler_api(lr_scheduler_configs, model)
    return optimizers, lr_scheduler_configs, optimizer_frequencies

In [17]:
from pytorch_lightning.core.optimizer import _configure_schedulers_automatic_opt, _set_scheduler_opt_idx, _validate_scheduler_api, _configure_schedulers_manual_opt

In [18]:
from pytorch_lightning.strategies import Strategy
from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers
from pytorch_lightning.trainer.states import TrainerFn

class CustomStrategy(Strategy):
    
    def __init__(self, 
        device = "cuda",
        accelerator = None,
        checkpoint_io = None,
        precision_plugin = None,):
        
        super().__init__(accelerator=accelerator, checkpoint_io=checkpoint_io, precision_plugin=precision_plugin)
        self._root_device = torch.device(device)
        self.global_rank = os.environ.get("RANK", 0)
        self.root_device = torch.device("cuda:0")

        print("DKLJF", self.root_device)
        print("LSKDJ", self._root_device)
        
#     def backward(self, closure_loss, * args, ** kwargs):
#         pass
    
#     def batch_to_device(self, batch, device = None, dataloader_idx = 0):
#         pass
    
#     def connect(self, model):
#         pass
    
#     def dispatch(self, trainer):
#         pass
    
#     def lightning_module_state_dict(self, ):
#         pass
    
#     def model_sharded_context(self, ):
#         yield
    
#     def on_predict_end(self, ):
#         pass
    
#     def on_predict_start(self, ):
#         pass
    
#     def on_test_end(self, ):
#         pass
        
#     def on_test_start(self, ):
#         pass
    
#     def on_train_batch_start(self, batch, batch_idx, dataloader_idx = 0):
#         pass
    
#     def on_train_end(self, ):
#         pass
    
#      def on_train_start(self, ):
#         pass
    
#     def on_validation_end(self, ):
#         pass
    
#     def on_validation_start(self, ):
#         pass
    
#     def optimizer_state(self, optimizer):
#         pass
    
#     def optimizer_step(self, optimizer, opt_idx, closure, model = None, ** kwargs):
#         pass
    
#     def post_backward(self, closure_loss):
#         pass
    
#     def post_dispatch(self, trainer):
#         pass
    
#     def pre_backward(self, closure_loss):
#         pass
    
#     def predict_step(self, * args, ** kwargs):
#         pass
    
#     def process_dataloader(self, dataloader):
#         pass
    
#     def reduce_boolean_decision(self, decision):
#         pass
    
#     def remove_checkpoint(self, filepath):
#         pass
    
#     def save_checkpoint(self, checkpoint, filepath, storage_options = None):
#         pass
    
# #     def setup(self, trainer):
# #         pass
    
#     def setup_environment(self, ):
#         pass
    
    def setup_optimizers(self, trainer):
        if trainer.state.fn not in (TrainerFn.FITTING, TrainerFn.TUNING):
            return
        self.optimizers, self.lr_scheduler_configs, self.optimizer_frequencies = custom_init_optimizers_and_lr_schedulers(
            self.lightning_module
        )
    
#     def setup_precision_plugin(self, ):
#         pass
    
#     def teardown(self, ):
#         pass
    
#     def test_step(self, * args, ** kwargs):
#         pass
    
# #     def training_step(self, * args, ** kwargs):
# #         pass
    
#     def validation_step(self, * args, ** kwargs):
#         pass
    
    def all_gather(self, tensor, group = None, sync_grads = False):
        pass
    
    def barrier(self, name = None):
        pass
    
    def broadcast(self, obj, src = 0):
        return obj
#         pass
    
    def is_global_zero(self, ):
        return True
#         pass
    
    def model_to_device(self, ):
        assert self.model is not None, "self.model must be set before self.model.to()"
        self.model.to(self.root_device)
    
    def reduce(self, tensor, group = None, reduce_op = 'mean'):
        pass
    
    def root_device(self) -> torch.device:
        return self._root_device

In [19]:
trainer=pl.Trainer(accelerator="gpu", devices=1, strategy = CustomStrategy(), enable_checkpointing=False)
# trainer=pl.Trainer(accelerator="gpu", devices=1, strategy=None)
# trainer=pl.Trainer(strategy = CustomStrategy(), enable_checkpointing=False)

  rank_zero_warn(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


DKLJF cuda:0
LSKDJ cuda


In [20]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]

  | Name  | Type            | Params
------------------------------------------
0 | model | GeneralizedRCNN | 44.3 M
------------------------------------------
44.1 M    Trainable params
222 K     Non-trainable params
44.3 M    Total params
177.390   Total estimated model params size (MB)


[(800, 1344), (1344, 800)]
loading annotations into memory...
Done (t=12.39s)
creating index...
index created!
shapes=[(800, 1344), (1344, 800)]
passthrough=False
Epoch 0: : 0it [00:00, ?it/s]TRAININGSTEP
tensor(39.3816, device='cuda:0', grad_fn=<AddBackward0>)
[ERROR] Exception CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED
Epoch 0: : 1it [01:07, 67.80s/it, v_num=8]TRAININGSTEP
tensor(80.7464, device='cuda:0', grad_fn=<AddBackward0>)
[ERROR] Exception CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED
[ERROR] Exception CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED
[ERROR] Exception CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED
[ERROR] Exception CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_s

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/opt/conda/lib/python3.8/multiprocessing/queues.py", line 116, in get
    return _ForkingPickler.loads(res)
  File "/opt/conda/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 295, in rebuild_storage_fd
    fd = df.detach()
  File "/opt/conda/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/opt/conda/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.current_process().au

[ERROR] Exception CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
x, y = next(model.prefetcher)

In [23]:
x

<maskrcnn_benchmark.structures.image_list.ImageList at 0x7f6bfb8e6f10>

In [24]:
y

[BoxList(num_boxes=2, image_width=1199, image_height=800, mode=xyxy)]