From a5ce55ba0255d95923c5282d5eacabacf74f27a4 Mon Sep 17 00:00:00 2001 From: Jinlu Zhang Date: Fri, 14 Oct 2022 11:53:37 +0800 Subject: [PATCH] [Feature] support DEKR (#1693) --- .../body/2d_kpt_sview_rgb_img/dekr/README.md | 22 + .../dekr/coco/hrnet_coco.md | 78 +++ .../dekr/coco/hrnet_coco.yml | 73 +++ .../dekr/coco/hrnet_w32_coco_512x512.py | 196 ++++++++ .../coco/hrnet_w32_coco_512x512_multiscale.py | 41 ++ .../dekr/coco/hrnet_w48_coco_640x640.py | 196 ++++++++ .../coco/hrnet_w48_coco_640x640_multiscale.py | 39 ++ .../dekr/crowdpose/hrnet_crowdpose.md | 65 +++ .../dekr/crowdpose/hrnet_crowdpose.yml | 73 +++ .../crowdpose/hrnet_w32_crowdpose_512x512.py | 195 ++++++++ .../hrnet_w32_crowdpose_512x512_multiscale.py | 40 ++ .../crowdpose/hrnet_w48_crowdpose_640x640.py | 195 ++++++++ .../hrnet_w48_crowdpose_640x640_multiscale.py | 40 ++ docs/en/papers/algorithms/dekr.md | 31 ++ mmpose/core/evaluation/bottom_up_eval.py | 17 +- mmpose/core/post_processing/__init__.py | 10 +- mmpose/core/post_processing/group.py | 139 ++++++ mmpose/core/post_processing/nms.py | 74 ++- .../core/post_processing/post_transforms.py | 2 +- .../datasets/pipelines/bottom_up_transform.py | 329 ++++++++++--- mmpose/models/detectors/__init__.py | 4 +- mmpose/models/detectors/one_stage.py | 449 ++++++++++++++++++ mmpose/models/heads/__init__.py | 3 +- mmpose/models/heads/dekr_head.py | 245 ++++++++++ mmpose/models/losses/__init__.py | 28 +- mmpose/models/losses/mse_loss.py | 8 +- mmpose/models/losses/regression_loss.py | 77 +++ mmpose/models/utils/__init__.py | 3 +- mmpose/models/utils/rescore.py | 77 +++ model-index.yml | 2 + tests/test_evaluation/test_bottom_up_eval.py | 6 + tests/test_losses/test_regression_losses.py | 41 ++ tests/test_models/test_bottom_up_head.py | 42 +- tests/test_models/test_one_stage_forward.py | 167 +++++++ .../test_bottom_up_pipelines.py | 113 ++++- tests/test_post_processing/test_nms.py | 34 +- 36 files changed, 3053 insertions(+), 101 deletions(-) create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/README.md create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_coco.md create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_coco.yml create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w32_coco_512x512.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w32_coco_512x512_multiscale.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w48_coco_640x640.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w48_coco_640x640_multiscale.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_crowdpose.md create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_crowdpose.yml create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w32_crowdpose_512x512.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w32_crowdpose_512x512_multiscale.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w48_crowdpose_640x640.py create mode 100644 configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w48_crowdpose_640x640_multiscale.py create mode 100644 docs/en/papers/algorithms/dekr.md create mode 100644 mmpose/models/detectors/one_stage.py create mode 100644 mmpose/models/heads/dekr_head.py create mode 100644 mmpose/models/utils/rescore.py create mode 100644 tests/test_models/test_one_stage_forward.py diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/README.md 
b/configs/body/2d_kpt_sview_rgb_img/dekr/README.md new file mode 100644 index 0000000000..04726421c0 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/README.md @@ -0,0 +1,22 @@ +# Bottom-up Human Pose Estimation via Disentangled Keypoint Regression (DEKR) + + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ +DEKR is a popular 2D bottom-up pose estimation approach that simultaneously detects all the instances and regresses the offsets from the instance centers to the joints. + +To predict the offsets more accurately, the offsets of different joints are regressed by separate branches built with deformable convolutional layers, so that convolution kernels with different shapes are adopted to extract features for the corresponding joints. diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_coco.md b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_coco.md new file mode 100644 index 0000000000..3ff4cce9d1 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_coco.md @@ -0,0 +1,78 @@ + 
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+COCO (ECCV'2014) + +```bibtex +@inproceedings{lin2014microsoft, + title={Microsoft coco: Common objects in context}, + author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence}, + booktitle={European conference on computer vision}, + pages={740--755}, + year={2014}, + organization={Springer} +} +``` + +
+ +Results on COCO val2017 without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [HRNet-w32](/configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/coco/hrnet_w32_coco_512x512.py) | 512x512 | 0.680 | 0.868 | 0.745 | 0.728 | 0.897 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_coco_512x512-2a3056de_20220928.pth) | [log](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_coco_512x512-20220928.log.json) | +| [HRNet-w48](/configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/coco/hrnet_w48_coco_640x640.py) | 640x640 | 0.709 | 0.876 | 0.773 | 0.758 | 0.909 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_coco_640x640-8854b2f1_20220930.pth) | [log](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_coco_640x640-20220930.log.json) | + +Results on COCO val2017 with multi-scale test. 3 default scales (\[2, 1, 0.5\]) are used + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | +| :------------------------------------------------------------------ | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :------------------------------------------------------------------: | +| [HRNet-w32](/configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/coco/hrnet_w32_coco_512x512_multiscale.py)\* | 512x512 | 0.705 | 0.878 | 0.767 | 0.759 | 0.921 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_coco_512x512-2a3056de_20220928.pth) | +| [HRNet-w48](/configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/coco/hrnet_w48_coco_640x640_multiscale.py)\* | 640x640 | 0.722 | 0.882 | 0.785 | 0.778 | 0.928 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_coco_640x640-8854b2f1_20220930.pth) | + +\* these configs are generally used for evaluation. The training settings are identical to their single-scale counterparts. + +The results of models provided by the authors on COCO val2017 using the same evaluation protocol + +| Arch | Input Size | Setting | AP | AP50 | AP75 | AR | AR50 | ckpt | +| :-------- | :--------: | :----------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :----------------------------------------------------------: | +| HRNet-w32 | 512x512 | single-scale | 0.678 | 0.868 | 0.744 | 0.728 | 0.897 | see [official implementation](https://github.com/HRNet/DEKR) | +| HRNet-w48 | 640x640 | single-scale | 0.707 | 0.876 | 0.773 | 0.757 | 0.909 | see [official implementation](https://github.com/HRNet/DEKR) | +| HRNet-w32 | 512x512 | multi-scale | 0.708 | 0.880 | 0.773 | 0.763 | 0.921 | see [official implementation](https://github.com/HRNet/DEKR) | +| HRNet-w48 | 640x640 | multi-scale | 0.721 | 0.881 | 0.786 | 0.779 | 0.927 | see [official implementation](https://github.com/HRNet/DEKR) | + +The discrepancy between these results and that shown in paper is attributed to the differences in implementation details in evaluation process. 
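As a quick sanity check of the checkpoints listed above, a model can be run through MMPose's bottom-up inference API. The sketch below is illustrative only: it assumes the config path created in this PR, the HRNet-w32 checkpoint URL from the table, and a placeholder test image.

```python
# Minimal inference sketch (MMPose 0.x bottom-up API); the image path is a
# placeholder and the config/checkpoint are taken from the table above.
from mmpose.apis import inference_bottom_up_pose_model, init_pose_model

config_file = 'configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w32_coco_512x512.py'
checkpoint_file = ('https://download.openmmlab.com/mmpose/bottom_up/dekr/'
                   'hrnet_w32_coco_512x512-2a3056de_20220928.pth')

pose_model = init_pose_model(config_file, checkpoint_file, device='cuda:0')

# Each element of `pose_results` holds the keypoints (Kx3 array) and the score
# of one detected person.
pose_results, _ = inference_bottom_up_pose_model(pose_model, 'path/to/image.jpg')
print(len(pose_results))
```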
diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_coco.yml b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_coco.yml new file mode 100644 index 0000000000..b2b708a891 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_coco.yml @@ -0,0 +1,73 @@ +Collections: +- Name: DEKR + Paper: + Title: Bottom-up human pose estimation via disentangled keypoint regression + URL: https://arxiv.org/abs/2104.02300 + README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/algorithms/dekr.md +Models: +- Config: configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/coco/hrnet_w32_coco_512x512.py + In Collection: DEKR + Metadata: + Architecture: &id001 + - DEKR + - HRNet + Training Data: COCO + Name: disentangled_keypoint_regression_hrnet_w32_coco_512x512 + Results: + - Dataset: COCO + Metrics: + AP: 0.68 + AP@0.5: 0.868 + AP@0.75: 0.745 + AR: 0.728 + AR@0.5: 0.897 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_coco_512x512-2a3056de_20220928.pth +- Config: configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/coco/hrnet_w48_coco_640x640.py + In Collection: DEKR + Metadata: + Architecture: *id001 + Training Data: COCO + Name: disentangled_keypoint_regression_hrnet_w48_coco_640x640 + Results: + - Dataset: COCO + Metrics: + AP: 0.709 + AP@0.5: 0.876 + AP@0.75: 0.773 + AR: 0.758 + AR@0.5: 0.909 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_coco_640x640-8854b2f1_20220930.pth +- Config: configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/coco/hrnet_w32_coco_512x512_multiscale.py + In Collection: DEKR + Metadata: + Architecture: *id001 + Training Data: COCO + Name: disentangled_keypoint_regression_hrnet_w32_coco_512x512_multiscale + Results: + - Dataset: COCO + Metrics: + AP: 0.705 + AP@0.5: 0.878 + AP@0.75: 0.767 + AR: 0.759 + AR@0.5: 0.921 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_coco_512x512-2a3056de_20220928.pth +- Config: configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/coco/hrnet_w48_coco_640x640_multiscale.py + In Collection: DEKR + Metadata: + Architecture: *id001 + Training Data: COCO + Name: disentangled_keypoint_regression_hrnet_w48_coco_640x640_multiscale + Results: + - Dataset: COCO + Metrics: + AP: 0.722 + AP@0.5: 0.882 + AP@0.75: 0.785 + AR: 0.778 + AR@0.5: 0.928 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_coco_640x640-8854b2f1_20220930.pth diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w32_coco_512x512.py b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w32_coco_512x512.py new file mode 100644 index 0000000000..8f2b95ac5d --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w32_coco_512x512.py @@ -0,0 +1,196 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/coco.py' +] +checkpoint_config = dict(interval=20) +evaluation = dict(interval=20, metric='mAP', save_best='AP') + +optimizer = dict( + type='Adam', + lr=0.001, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[90, 120]) +total_epochs = 140 +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
15, 16 + ]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='DisentangledKeypointRegressor', + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256), + multiscale_output=True)), + ), + keypoint_head=dict( + type='DEKRHead', + in_channels=(32, 64, 128, 256), + in_index=(0, 1, 2, 3), + num_heatmap_filters=32, + num_joints=channel_cfg['dataset_joints'], + input_transform='resize_concat', + heatmap_loss=dict( + type='JointsMSELoss', + use_target_weight=True, + loss_weight=1.0, + ), + offset_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + loss_weight=0.002, + beta=1 / 9.0, + )), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + project2image=False, + align_corners=False, + max_pool_kernel=5, + use_nms=True, + nms_dist_thr=0.05, + nms_joints_thr=8, + keypoint_threshold=0.01, + rescore_cfg=dict( + in_channels=74, + norm_indexes=(5, 6), + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_coco-33d58c5c.pth'), + flip_test=True)) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='GetKeypointCenterArea'), + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=(2, 4), + gen_center_heatmap=True, + bg_weight=0.1, + ), + dict( + type='BottomUpGenerateOffsetTarget', + radius=4, + ), + dict( + type='Collect', + keys=['img', 'heatmaps', 'masks', 'offsets', 'offset_weights'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'num_joints', 'skeleton', + 'image_size', 'heatmap_size' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + workers_per_gpu=4, + train_dataloader=dict(samples_per_gpu=10), + val_dataloader=dict(samples_per_gpu=1), + test_dataloader=dict(samples_per_gpu=1), + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + 
dataset_info={{_base_.dataset_info}}), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w32_coco_512x512_multiscale.py b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w32_coco_512x512_multiscale.py new file mode 100644 index 0000000000..292ad07651 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w32_coco_512x512_multiscale.py @@ -0,0 +1,41 @@ +_base_ = ['hrnet_w32_coco_512x512.py'] + +model = dict( + test_cfg=dict( + multi_scale_score_decrease=1.0, + nms_dist_thr=0.1, + max_pool_kernel=9, + )) + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpGetImgSize', + base_length=32, + test_scale_factor=[0.5, 1, 2]), + dict( + type='BottomUpResizeAlign', + base_length=32, + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'num_joints', 'skeleton', + 'image_size', 'heatmap_size' + ]), +] + +test_pipeline = val_pipeline + +data = dict( + val=dict(pipeline=val_pipeline), + test=dict(pipeline=test_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w48_coco_640x640.py b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w48_coco_640x640.py new file mode 100644 index 0000000000..e9ec2ba574 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w48_coco_640x640.py @@ -0,0 +1,196 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/coco.py' +] +checkpoint_config = dict(interval=20) +evaluation = dict(interval=20, metric='mAP', save_best='AP') + +optimizer = dict( + type='Adam', + lr=0.001, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[90, 120]) +total_epochs = 140 +channel_cfg = dict( + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +data_cfg = dict( + image_size=640, + base_size=320, + base_sigma=2, + heatmap_size=[160], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='DisentangledKeypointRegressor', + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + 
stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384), + multiscale_output=True)), + ), + keypoint_head=dict( + type='DEKRHead', + in_channels=(48, 96, 192, 384), + in_index=(0, 1, 2, 3), + num_heatmap_filters=48, + num_joints=channel_cfg['dataset_joints'], + input_transform='resize_concat', + heatmap_loss=dict( + type='JointsMSELoss', + use_target_weight=True, + loss_weight=1.0, + ), + offset_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + loss_weight=0.002, + beta=1 / 9.0, + )), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + project2image=False, + align_corners=False, + max_pool_kernel=5, + use_nms=True, + nms_dist_thr=0.05, + nms_joints_thr=8, + keypoint_threshold=0.01, + rescore_cfg=dict( + in_channels=74, + norm_indexes=(5, 6), + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_coco-33d58c5c.pth'), + flip_test=True)) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='GetKeypointCenterArea'), + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=(2, 4), + gen_center_heatmap=True, + bg_weight=0.1, + ), + dict( + type='BottomUpGenerateOffsetTarget', + radius=4, + ), + dict( + type='Collect', + keys=['img', 'heatmaps', 'masks', 'offsets', 'offset_weights'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'num_joints', 'skeleton', + 'image_size', 'heatmap_size' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + workers_per_gpu=4, + train_dataloader=dict(samples_per_gpu=5), + val_dataloader=dict(samples_per_gpu=1), + test_dataloader=dict(samples_per_gpu=1), + train=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='BottomUpCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w48_coco_640x640_multiscale.py b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w48_coco_640x640_multiscale.py new file mode 100644 index 0000000000..60eae17fd7 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_w48_coco_640x640_multiscale.py @@ -0,0 +1,39 @@ +_base_ = 
['hrnet_w48_coco_640x640.py'] + +model = dict(test_cfg=dict( + nms_dist_thr=0.1, + max_pool_kernel=11, +)) + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpGetImgSize', + base_length=32, + test_scale_factor=[0.5, 1, 2]), + dict( + type='BottomUpResizeAlign', + base_length=32, + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'num_joints', 'skeleton', + 'image_size', 'heatmap_size' + ]), +] + +test_pipeline = val_pipeline + +data = dict( + val=dict(pipeline=val_pipeline), + test=dict(pipeline=test_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_crowdpose.md b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_crowdpose.md new file mode 100644 index 0000000000..c2bcaaeabb --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_crowdpose.md @@ -0,0 +1,65 @@ + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ + + +
+HRNet (CVPR'2019) + +```bibtex +@inproceedings{sun2019deep, + title={Deep high-resolution representation learning for human pose estimation}, + author={Sun, Ke and Xiao, Bin and Liu, Dong and Wang, Jingdong}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={5693--5703}, + year={2019} +} +``` + +
+ + + +
+CrowdPose (CVPR'2019) + +```bibtex +@article{li2018crowdpose, + title={CrowdPose: Efficient Crowded Scenes Pose Estimation and A New Benchmark}, + author={Li, Jiefeng and Wang, Can and Zhu, Hao and Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu}, + journal={arXiv preprint arXiv:1812.00324}, + year={2018} +} +``` + +
+ +Results on CrowdPose test without multi-scale test + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | log | +| :-------------------------------------------- | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :-------------------------------------------: | :-------------------------------------------: | +| [HRNet-w32](/configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/crowdpose/hrnet_w32_crowdpose_512x512.py) | 512x512 | 0.663 | 0.857 | 0.715 | 0.719 | 0.893 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_crowdpose_512x512-685aff75_20220924.pth) | [log](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_crowdpose_512x512-20220924.log.json) | +| [HRNet-w48](/configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/crowdpose/hrnet_w48_crowdpose_640x640.py) | 640x640 | 0.682 | 0.869 | 0.736 | 0.742 | 0.911 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_crowdpose_640x640-ef6b6040_20220930.pth) | [log](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_crowdpose_640x640-20220930.log.json) | + +Results on CrowdPose test with multi-scale test. 3 default scales (\[2, 1, 0.5\]) are used + +| Arch | Input Size | AP | AP50 | AP75 | AR | AR50 | ckpt | +| :------------------------------------------------------------------ | :--------: | :---: | :-------------: | :-------------: | :---: | :-------------: | :------------------------------------------------------------------: | +| [HRNet-w32](/configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/crowdpose/hrnet_w32_crowdpose_512x512_multiscale.py)\* | 512x512 | 0.692 | 0.874 | 0.748 | 0.755 | 0.926 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_crowdpose_512x512-685aff75_20220924.pth) | +| [HRNet-w48](/configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/crowdpose/hrnet_w48_crowdpose_640x640_multiscale.py)\* | 640x640 | 0.696 | 0.869 | 0.749 | 0.769 | 0.933 | [ckpt](https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_crowdpose_640x640-ef6b6040_20220930.pth) | + +\* these configs are generally used for evaluation. The training settings are identical to their single-scale counterparts. 
diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_crowdpose.yml b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_crowdpose.yml new file mode 100644 index 0000000000..0b5cdd530a --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_crowdpose.yml @@ -0,0 +1,73 @@ +Collections: +- Name: DEKR + Paper: + Title: Bottom-up human pose estimation via disentangled keypoint regression + URL: https://arxiv.org/abs/2104.02300 + README: https://github.com/open-mmlab/mmpose/blob/master/docs/en/papers/algorithms/dekr.md +Models: +- Config: configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/crowdpose/hrnet_w32_crowdpose_512x512.py + In Collection: DEKR + Metadata: + Architecture: &id001 + - DEKR + - HRNet + Training Data: CrowdPose + Name: disentangled_keypoint_regression_hrnet_w32_crowdpose_512x512 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.663 + AP@0.5: 0.857 + AP@0.75: 0.715 + AR: 0.719 + AR@0.5: 0.893 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_crowdpose_512x512-685aff75_20220924.pth +- Config: configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/crowdpose/hrnet_w48_crowdpose_640x640.py + In Collection: DEKR + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: disentangled_keypoint_regression_hrnet_w48_crowdpose_640x640 + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.682 + AP@0.5: 0.869 + AP@0.75: 0.736 + AR: 0.742 + AR@0.5: 0.911 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_crowdpose_640x640-ef6b6040_20220930.pth +- Config: configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/crowdpose/hrnet_w32_crowdpose_512x512_multiscale.py + In Collection: DEKR + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: disentangled_keypoint_regression_hrnet_w32_crowdpose_512x512_multiscale + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.692 + AP@0.5: 0.874 + AP@0.75: 0.748 + AR: 0.755 + AR@0.5: 0.926 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w32_crowdpose_512x512-685aff75_20220924.pth +- Config: configs/body/2d_kpt_sview_rgb_img/disentangled_keypoint_regression/crowdpose/hrnet_w48_crowdpose_640x640_multiscale.py + In Collection: DEKR + Metadata: + Architecture: *id001 + Training Data: CrowdPose + Name: disentangled_keypoint_regression_hrnet_w48_crowdpose_640x640_multiscale + Results: + - Dataset: CrowdPose + Metrics: + AP: 0.696 + AP@0.5: 0.869 + AP@0.75: 0.749 + AR: 0.769 + AR@0.5: 0.933 + Task: Body 2D Keypoint + Weights: https://download.openmmlab.com/mmpose/bottom_up/dekr/hrnet_w48_crowdpose_640x640-ef6b6040_20220930.pth diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w32_crowdpose_512x512.py b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w32_crowdpose_512x512.py new file mode 100644 index 0000000000..4f5dc61626 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w32_crowdpose_512x512.py @@ -0,0 +1,195 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/crowdpose.py' +] +checkpoint_config = dict(interval=20) +evaluation = dict(interval=20, metric='mAP', save_best='AP') + +optimizer = dict( + type='Adam', + lr=0.001, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +channel_cfg = 
dict( + num_output_channels=14, + dataset_joints=14, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) + +data_cfg = dict( + image_size=512, + base_size=256, + base_sigma=2, + heatmap_size=[128, 256], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=2, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='DisentangledKeypointRegressor', + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w32-36af842e.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256), + multiscale_output=True)), + ), + keypoint_head=dict( + type='DEKRHead', + in_channels=(32, 64, 128, 256), + in_index=(0, 1, 2, 3), + num_heatmap_filters=32, + num_joints=channel_cfg['dataset_joints'], + input_transform='resize_concat', + heatmap_loss=dict( + type='JointsMSELoss', + use_target_weight=True, + loss_weight=1.0, + ), + offset_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + loss_weight=0.004, + beta=1 / 9.0, + )), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + project2image=False, + align_corners=False, + max_pool_kernel=5, + use_nms=True, + nms_dist_thr=0.05, + nms_joints_thr=7, + keypoint_threshold=0.01, + rescore_cfg=dict( + in_channels=59, + norm_indexes=(0, 1), + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth'), + flip_test=True)) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='GetKeypointCenterArea'), + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=(2, 4), + gen_center_heatmap=True, + bg_weight=0.1, + ), + dict( + type='BottomUpGenerateOffsetTarget', + radius=4, + ), + dict( + type='Collect', + keys=['img', 'heatmaps', 'masks', 'offsets', 'offset_weights'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'num_joints', 'skeleton', + 'image_size', 'heatmap_size' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/crowdpose' +data = dict( + workers_per_gpu=4, + train_dataloader=dict(samples_per_gpu=10), + val_dataloader=dict(samples_per_gpu=1), + test_dataloader=dict(samples_per_gpu=1), + 
train=dict( + type='BottomUpCrowdPoseDataset', + ann_file=f'{data_root}/annotations/mmpose_crowdpose_trainval.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='BottomUpCrowdPoseDataset', + ann_file=f'{data_root}/annotations/mmpose_crowdpose_test.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='BottomUpCrowdPoseDataset', + ann_file=f'{data_root}/annotations/mmpose_crowdpose_test.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w32_crowdpose_512x512_multiscale.py b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w32_crowdpose_512x512_multiscale.py new file mode 100644 index 0000000000..8a79699dc8 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w32_crowdpose_512x512_multiscale.py @@ -0,0 +1,40 @@ +_base_ = ['hrnet_w32_crowdpose_512x512.py'] + +model = dict( + test_cfg=dict( + multi_scale_score_decrease=0.9, + nms_dist_thr=0.1, + )) + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpGetImgSize', + base_length=32, + test_scale_factor=[0.5, 1, 2]), + dict( + type='BottomUpResizeAlign', + base_length=32, + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'num_joints', 'skeleton', + 'image_size', 'heatmap_size' + ]), +] + +test_pipeline = val_pipeline + +data = dict( + val=dict(pipeline=val_pipeline), + test=dict(pipeline=test_pipeline), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w48_crowdpose_640x640.py b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w48_crowdpose_640x640.py new file mode 100644 index 0000000000..4e071bcc9a --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w48_crowdpose_640x640.py @@ -0,0 +1,195 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/crowdpose.py' +] +checkpoint_config = dict(interval=20) +evaluation = dict(interval=20, metric='mAP', save_best='AP') + +optimizer = dict( + type='Adam', + lr=0.001, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[200, 260]) +total_epochs = 300 +channel_cfg = dict( + num_output_channels=14, + dataset_joints=14, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]) + +data_cfg = dict( + image_size=640, + base_size=320, + base_sigma=2, + heatmap_size=[160], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + num_scales=1, + scale_aware_sigma=False, +) + +# model settings +model = dict( + type='DisentangledKeypointRegressor', + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/hrnet_w48-8ef0771d.pth', + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + 
num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(48, 96)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(48, 96, 192)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(48, 96, 192, 384), + multiscale_output=True)), + ), + keypoint_head=dict( + type='DEKRHead', + in_channels=(48, 96, 192, 384), + in_index=(0, 1, 2, 3), + num_heatmap_filters=48, + num_joints=channel_cfg['dataset_joints'], + input_transform='resize_concat', + heatmap_loss=dict( + type='JointsMSELoss', + use_target_weight=True, + loss_weight=1.0, + ), + offset_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False, + loss_weight=0.004, + beta=1 / 9.0, + )), + train_cfg=dict(), + test_cfg=dict( + num_joints=channel_cfg['dataset_joints'], + max_num_people=30, + project2image=False, + align_corners=False, + max_pool_kernel=5, + use_nms=True, + nms_dist_thr=0.05, + nms_joints_thr=8, + keypoint_threshold=0.01, + rescore_cfg=dict( + in_channels=59, + norm_indexes=(0, 1), + pretrained='https://download.openmmlab.com/mmpose/' + 'pretrain_models/kpt_rescore_crowdpose-300c7efe.pth'), + flip_test=True)) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpRandomAffine', + rot_factor=30, + scale_factor=[0.75, 1.5], + scale_type='short', + trans_factor=40), + dict(type='BottomUpRandomFlip', flip_prob=0.5), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='GetKeypointCenterArea'), + dict( + type='BottomUpGenerateHeatmapTarget', + sigma=(2, 4), + gen_center_heatmap=True, + bg_weight=0.1, + ), + dict( + type='BottomUpGenerateOffsetTarget', + radius=4, + ), + dict( + type='Collect', + keys=['img', 'heatmaps', 'masks', 'offsets', 'offset_weights'], + meta_keys=[]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='BottomUpGetImgSize', test_scale_factor=[1]), + dict( + type='BottomUpResizeAlign', + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'num_joints', 'skeleton', + 'image_size', 'heatmap_size' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/crowdpose' +data = dict( + workers_per_gpu=4, + train_dataloader=dict(samples_per_gpu=5), + val_dataloader=dict(samples_per_gpu=1), + test_dataloader=dict(samples_per_gpu=1), + train=dict( + type='BottomUpCrowdPoseDataset', + ann_file=f'{data_root}/annotations/mmpose_crowdpose_trainval.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='BottomUpCrowdPoseDataset', + ann_file=f'{data_root}/annotations/mmpose_crowdpose_test.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='BottomUpCrowdPoseDataset', + ann_file=f'{data_root}/annotations/mmpose_crowdpose_test.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w48_crowdpose_640x640_multiscale.py 
b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w48_crowdpose_640x640_multiscale.py new file mode 100644 index 0000000000..05576f0ca5 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_w48_crowdpose_640x640_multiscale.py @@ -0,0 +1,40 @@ +_base_ = ['hrnet_w48_crowdpose_640x640.py'] + +model = dict( + test_cfg=dict( + multi_scale_score_decrease=0.9, + nms_dist_thr=0.1, + )) + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='BottomUpGetImgSize', + base_length=32, + test_scale_factor=[0.5, 1, 2]), + dict( + type='BottomUpResizeAlign', + base_length=32, + transforms=[ + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + ]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'aug_data', 'test_scale_factor', 'base_size', + 'center', 'scale', 'flip_index', 'num_joints', 'skeleton', + 'image_size', 'heatmap_size' + ]), +] + +test_pipeline = val_pipeline + +data = dict( + val=dict(pipeline=val_pipeline), + test=dict(pipeline=test_pipeline), +) diff --git a/docs/en/papers/algorithms/dekr.md b/docs/en/papers/algorithms/dekr.md new file mode 100644 index 0000000000..ee19a3315b --- /dev/null +++ b/docs/en/papers/algorithms/dekr.md @@ -0,0 +1,31 @@ +# Bottom-up Human Pose Estimation via Disentangled Keypoint Regression + + + +
+DEKR (CVPR'2021) + +```bibtex +@inproceedings{geng2021bottom, + title={Bottom-up human pose estimation via disentangled keypoint regression}, + author={Geng, Zigang and Sun, Ke and Xiao, Bin and Zhang, Zhaoxiang and Wang, Jingdong}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14676--14686}, + year={2021} +} +``` + +
+ +## Abstract + + + +In this paper, we are interested in the bottom-up paradigm of estimating human poses from an image. We study the dense keypoint regression framework that is previously inferior to the keypoint detection and grouping framework. Our motivation is that regressing keypoint positions accurately needs to learn representations that focus on the keypoint regions. +We present a simple yet effective approach, named disentangled keypoint regression (DEKR). We adopt adaptive convolutions through pixel-wise spatial transformer to activate the pixels in the keypoint regions and accordingly learn representations from them. We use a multi-branch structure for separate regression: each branch learns a representation with dedicated adaptive convolutions and regresses one keypoint. The resulting disentangled representations are able to attend to the keypoint regions, respectively, and thus the keypoint regression is spatially more accurate. We empirically show that the proposed direct regression method outperforms keypoint detection and grouping methods and achieves superior bottom-up pose estimation results on two benchmark datasets, COCO and CrowdPose. The code and models are available at [this https URL](https://github.com/HRNet/DEKR). + + + +
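The dense regression described in the abstract is decoded at test time by the `HeatmapOffsetParser` added in this PR: every pixel regresses a full pose by subtracting its predicted offsets from its own coordinates, and poses are read out at the local maxima of the center heatmap. The following sketch condenses that idea with dummy tensors; it mirrors `_offset_to_pose` but is not the literal implementation.

```python
# Condensed sketch of DEKR-style offset decoding with dummy tensors:
# the pose proposal at pixel (x, y) is (x, y) minus the 2K offsets there.
import torch

num_joints, h, w = 17, 128, 128
offsets = torch.randn(1, num_joints * 2, h, w)  # dummy offset maps

x = torch.arange(w).float()
y = torch.arange(h).float()
y_grid, x_grid = torch.meshgrid(y, x)  # regular pixel coordinates
coords = torch.stack((x_grid, y_grid), dim=0)  # (2, H, W)

# (K, 2, H, W): the keypoint locations regressed by every pixel
posemap = coords.unsqueeze(0) - offsets.view(num_joints, 2, h, w)

# Reading the pose map at one "center" pixel gives a (K, 2) pose proposal.
center_y, center_x = 64, 64
pose = posemap[:, :, center_y, center_x]
print(pose.shape)  # torch.Size([17, 2])
```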
+ +
diff --git a/mmpose/core/evaluation/bottom_up_eval.py b/mmpose/core/evaluation/bottom_up_eval.py index 7b37d7c98e..cab9ace93f 100644 --- a/mmpose/core/evaluation/bottom_up_eval.py +++ b/mmpose/core/evaluation/bottom_up_eval.py @@ -74,7 +74,7 @@ def _resize_average(feature_maps, align_corners, index=-1, resize_size=None): index (int): Only used when `resize_size' is None. If `resize_size' is None, the target size is the size of the indexed feature maps. - resize_size (list[int, int]): The target size [w, h]. + resize_size (list[int, int]): The target size [h, w]. Returns: list[Tensor]: Averaged feature_maps. @@ -105,7 +105,7 @@ def _resize_unsqueeze_concat(feature_maps, index (int): Only used when `resize_size' is None. If `resize_size' is None, the target size is the size of the indexed feature maps. - resize_size (list[int, int]): The target size [w, h]. + resize_size (list[int, int]): The target size [h, w]. Returns: list[Tensor]: Averaged feature_maps. @@ -131,7 +131,7 @@ def _resize_concate(feature_maps, align_corners, index=-1, resize_size=None): index (int): Only used when `resize_size' is None. If `resize_size' is None, the target size is the size of the indexed feature maps. - resize_size (list[int, int]): The target size [w, h]. + resize_size (list[int, int]): The target size [h, w]. Returns: list[Tensor]: Averaged feature_maps. @@ -253,6 +253,8 @@ def aggregate_stage_flip(feature_maps, def aggregate_scale(feature_maps_list, align_corners=False, + project2image=True, + size_projected=None, aggregate_scale='average'): """Aggregate multi-scale outputs. @@ -265,6 +267,7 @@ def aggregate_scale(feature_maps_list, Args: feature_maps_list (list[Tensor]): Aggregated feature maps. project2image (bool): Option to resize to base scale. + size_projected (list[int, int]): Base size of heatmaps [w, h]. align_corners (bool): Align corners when performing interpolation. aggregate_scale (str): Methods to aggregate multi-scale feature maps. Options: 'average', 'unsqueeze_concat'. @@ -277,13 +280,17 @@ def aggregate_scale(feature_maps_list, Tensor: Aggregated feature maps. """ + resize_size = None + if project2image and size_projected: + resize_size = (size_projected[1], size_projected[0]) + if aggregate_scale == 'average': output_feature_maps = _resize_average( - feature_maps_list, align_corners, index=0, resize_size=None) + feature_maps_list, align_corners, index=0, resize_size=resize_size) elif aggregate_scale == 'unsqueeze_concat': output_feature_maps = _resize_unsqueeze_concat( - feature_maps_list, align_corners, index=0, resize_size=None) + feature_maps_list, align_corners, index=0, resize_size=resize_size) else: NotImplementedError() diff --git a/mmpose/core/post_processing/__init__.py b/mmpose/core/post_processing/__init__.py index 8076b799b9..7c36f595dd 100644 --- a/mmpose/core/post_processing/__init__.py +++ b/mmpose/core/post_processing/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .nms import oks_iou, oks_nms, soft_oks_nms +from .nms import nearby_joints_nms, oks_iou, oks_nms, soft_oks_nms from .one_euro_filter import OneEuroFilter from .post_transforms import (affine_transform, flip_back, fliplr_joints, fliplr_regression, get_affine_transform, @@ -9,8 +9,8 @@ from .smoother import Smoother __all__ = [ - 'oks_nms', 'soft_oks_nms', 'affine_transform', 'rotate_point', 'flip_back', - 'fliplr_joints', 'fliplr_regression', 'transform_preds', - 'get_affine_transform', 'get_warp_matrix', 'warp_affine_joints', 'oks_iou', - 'OneEuroFilter', 'Smoother' + 'oks_nms', 'soft_oks_nms', 'nearby_joints_nms', 'affine_transform', + 'rotate_point', 'flip_back', 'fliplr_joints', 'fliplr_regression', + 'transform_preds', 'get_affine_transform', 'get_warp_matrix', + 'warp_affine_joints', 'oks_iou', 'OneEuroFilter', 'Smoother' ] diff --git a/mmpose/core/post_processing/group.py b/mmpose/core/post_processing/group.py index 75499cb0bc..5d1b7d5aa2 100644 --- a/mmpose/core/post_processing/group.py +++ b/mmpose/core/post_processing/group.py @@ -416,3 +416,142 @@ def parse(self, heatmaps, tags, adjust=True, refine=True): results = [results] return results, scores + + +class HeatmapOffsetParser: + """The heatmap&offset parser for post processing.""" + + def __init__(self, cfg): + super(HeatmapOffsetParser, self).__init__() + + self.num_joints = cfg['num_joints'] + self.keypoint_threshold = cfg['keypoint_threshold'] + self.max_num_people = cfg['max_num_people'] + + # init pooling layer + kernel_size = cfg.get('max_pool_kernel', 5) + self.pool = torch.nn.MaxPool2d(kernel_size, 1, kernel_size // 2) + + def _offset_to_pose(self, offsets): + """Convert offset maps to pose maps. + + Note: + batch size: N + number of keypoints: K + offset maps height: H + offset maps width: W + + Args: + offsets (torch.Tensor[NxKxHxW]): model output offset maps. + + Returns: + torch.Tensor[NxKxHxW]: A tensor containing pose for each pixel. + """ + h, w = offsets.shape[-2:] + offsets = offsets.view(self.num_joints, -1, h, w) + + # generate regular coordinates + x = torch.arange(0, offsets.shape[-1]).float() + y = torch.arange(0, offsets.shape[-2]).float() + y, x = torch.meshgrid(y, x) + regular_coords = torch.stack((x, y), dim=0).unsqueeze(0) + + posemaps = regular_coords.to(offsets) - offsets + posemaps = posemaps.view(1, -1, h, w) + return posemaps + + def _get_maximum_from_heatmap(self, heatmap): + """Find local maximum of heatmap to localize instances. + + Note: + batch size: N + heatmap height: H + heatmap width: W + + Args: + heatmap (torch.Tensor[Nx1xHxW]): model output center heatmap. + + Returns: + tuple: A tuple containing instances detection results. + + - pos_idx (torch.Tensor): Index of pixels which have detected + instances. + - score (torch.Tensor): Score of detected instances. + """ + assert heatmap.size(0) == 1 and heatmap.size(1) == 1 + max_map = torch.eq(heatmap, self.pool(heatmap)).float() + heatmap = heatmap * max_map + score = heatmap.view(-1) + + score, pos_idx = score.topk(self.max_num_people) + mask = score > self.keypoint_threshold + score = score[mask] + pos_idx = pos_idx[mask] + return pos_idx, score + + def decode(self, heatmaps, offsets): + """Convert center heatmaps and offset maps to poses. + + Note: + batch size: N + number of keypoints: K + offset maps height: H + offset maps width: W + + Args: + heatmaps (torch.Tensor[Nx(1+K)xHxW]): model output heatmaps. + offsets (torch.Tensor[NxKxHxW]): model output offset maps. 
+ + Returns: + torch.Tensor[NxKx4]: A tensor containing predicted pose and + score for each instance. + """ + + posemap = self._offset_to_pose(offsets) + inst_indexes, inst_scores = self._get_maximum_from_heatmap( + heatmaps[:, :1]) + + poses = posemap.view(posemap.size(1), -1)[..., inst_indexes] + poses = poses.view(self.num_joints, 2, -1).permute(2, 0, + 1).contiguous() + inst_scores = inst_scores.unsqueeze(1).unsqueeze(2).expand( + poses.size()) + poses = torch.cat((poses, inst_scores), dim=2) + return poses.clone() + + def refine_score(self, heatmaps, poses): + """Refine instance scores with keypoint heatmaps. + + Note: + batch size: N + number of keypoints: K + offset maps height: H + offset maps width: W + + Args: + heatmaps (torch.Tensor[Nx(1+K)xHxW]): model output heatmaps. + poses (torch.Tensor[NxKx4]): decoded pose and score for each + instance. + + Returns: + torch.Tensor[NxKx4]: poses with refined scores. + """ + normed_poses = poses.unsqueeze(0).permute(2, 0, 1, 3).contiguous() + normed_poses = torch.cat(( + normed_poses.narrow(3, 0, 1) / (heatmaps.size(3) - 1) * 2 - 1, + normed_poses.narrow(3, 1, 1) / (heatmaps.size(2) - 1) * 2 - 1, + ), + dim=3) + kpt_scores = torch.nn.functional.grid_sample( + heatmaps[:, 1:].view(self.num_joints, 1, heatmaps.size(2), + heatmaps.size(3)), + normed_poses, + padding_mode='border').view(self.num_joints, -1) + kpt_scores = kpt_scores.transpose(0, 1).contiguous() + + # scores only from keypoint heatmaps + poses[..., 3] = kpt_scores + # combine center and keypoint heatmaps + poses[..., 2] = poses[..., 2] * kpt_scores + + return poses diff --git a/mmpose/core/post_processing/nms.py b/mmpose/core/post_processing/nms.py index 86a0ab35e0..8c197203cd 100644 --- a/mmpose/core/post_processing/nms.py +++ b/mmpose/core/post_processing/nms.py @@ -161,7 +161,7 @@ def soft_oks_nms(kpts_db, """Soft OKS NMS implementations. Args: - kpts_db + kpts_db: keypoints and scores. thr: retain oks overlap < thr. max_dets: max number of detections to keep. sigmas: Keypoint labelling uncertainty. @@ -205,3 +205,75 @@ def soft_oks_nms(kpts_db, keep = keep[:keep_cnt] return keep + + +def nearby_joints_nms( + kpts_db, + dist_thr, + num_nearby_joints_thr=None, + score_per_joint=False, + max_dets=-1, +): + """Nearby joints NMS implementations. + + Args: + kpts_db (list[dict]): keypoints and scores. + dist_thr (float): threshold for judging whether two joints are close. + num_nearby_joints_thr (int): threshold for judging whether two + instances are close. + max_dets (int): max number of detections to keep. + score_per_joint (bool): the input scores (in kpts_db) are per joint + scores. + + Returns: + np.ndarray: indexes to keep. + """ + + assert dist_thr > 0, '`dist_thr` must be greater than 0.' + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'] for k in kpts_db]) + + num_people, num_joints, _ = kpts.shape + if num_nearby_joints_thr is None: + num_nearby_joints_thr = num_joints // 2 + assert num_nearby_joints_thr < num_joints, '`num_nearby_joints_thr` must '\ + 'be less than the number of joints.' 
+ + # compute distance threshold + pose_area = kpts.max(axis=1) - kpts.min(axis=1) + pose_area = np.sqrt(np.power(pose_area, 2).sum(axis=1)) + pose_area = pose_area.reshape(num_people, 1, 1) + pose_area = np.tile(pose_area, (num_people, num_joints)) + close_dist_thr = pose_area * dist_thr + + # count nearby joints between instances + instance_dist = kpts[:, None] - kpts + instance_dist = np.sqrt(np.power(instance_dist, 2).sum(axis=3)) + close_instance_num = (instance_dist < close_dist_thr).sum(2) + close_instance = close_instance_num > num_nearby_joints_thr + + # apply nms + ignored_pose_inds, keep_pose_inds = set(), list() + indexes = np.argsort(scores)[::-1] + for i in indexes: + if i in ignored_pose_inds: + continue + keep_inds = close_instance[i].nonzero()[0] + keep_ind = keep_inds[np.argmax(scores[keep_inds])] + if keep_ind not in ignored_pose_inds: + keep_pose_inds.append(keep_ind) + ignored_pose_inds = ignored_pose_inds.union(set(keep_inds)) + + # limit the number of output instances + if max_dets > 0 and len(keep_pose_inds) > max_dets: + sub_inds = np.argsort(scores[keep_pose_inds])[-1:-max_dets - 1:-1] + keep_pose_inds = [keep_pose_inds[i] for i in sub_inds] + + return keep_pose_inds diff --git a/mmpose/core/post_processing/post_transforms.py b/mmpose/core/post_processing/post_transforms.py index 8d9dfc8b99..b0c077309c 100644 --- a/mmpose/core/post_processing/post_transforms.py +++ b/mmpose/core/post_processing/post_transforms.py @@ -187,7 +187,7 @@ def transform_preds(coords, center, scale, output_size, use_udp=False): scale_x = scale[0] / output_size[0] scale_y = scale[1] / output_size[1] - target_coords = np.ones_like(coords) + target_coords = coords.copy() target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 diff --git a/mmpose/datasets/pipelines/bottom_up_transform.py b/mmpose/datasets/pipelines/bottom_up_transform.py index a2de37c9ee..d1fb613835 100644 --- a/mmpose/datasets/pipelines/bottom_up_transform.py +++ b/mmpose/datasets/pipelines/bottom_up_transform.py @@ -20,6 +20,7 @@ def _get_multi_scale_size(image, input_size, current_scale, min_scale, + base_length=64, use_udp=False): """Get the size for multi-scale training. @@ -28,6 +29,8 @@ def _get_multi_scale_size(image, input_size (np.ndarray[2]): Size (w, h) of the image input. current_scale (float): Scale factor. min_scale (float): Minimal scale. + base_length (int): The width and height should be multiples of + base_length. Default: 64. use_udp (bool): To use unbiased data processing. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). @@ -36,20 +39,20 @@ def _get_multi_scale_size(image, tuple: A tuple containing multi-scale sizes. 
- (w_resized, h_resized) (tuple(int)): resized width/height - - center (np.ndarray)image center + - center (np.ndarray): image center - scale (np.ndarray): scales wrt width/height """ assert len(input_size) == 2 h, w, _ = image.shape # calculate the size for min_scale - min_input_w = _ceil_to_multiples_of(min_scale * input_size[0], 64) - min_input_h = _ceil_to_multiples_of(min_scale * input_size[1], 64) + min_input_w = _ceil_to_multiples_of(min_scale * input_size[0], base_length) + min_input_h = _ceil_to_multiples_of(min_scale * input_size[1], base_length) if w < h: w_resized = int(min_input_w * current_scale / min_scale) h_resized = int( - _ceil_to_multiples_of(min_input_w / w * h, 64) * current_scale / - min_scale) + _ceil_to_multiples_of(min_input_w / w * h, base_length) * + current_scale / min_scale) if use_udp: scale_w = w - 1.0 scale_h = (h_resized - 1.0) / (w_resized - 1.0) * (w - 1.0) @@ -59,8 +62,8 @@ def _get_multi_scale_size(image, else: h_resized = int(min_input_h * current_scale / min_scale) w_resized = int( - _ceil_to_multiples_of(min_input_h / h * w, 64) * current_scale / - min_scale) + _ceil_to_multiples_of(min_input_h / h * w, base_length) * + current_scale / min_scale) if use_udp: scale_h = h - 1.0 scale_w = (w_resized - 1.0) / (h_resized - 1.0) * (h - 1.0) @@ -74,7 +77,11 @@ def _get_multi_scale_size(image, return (w_resized, h_resized), center, np.array([scale_w, scale_h]) -def _resize_align_multi_scale(image, input_size, current_scale, min_scale): +def _resize_align_multi_scale(image, + input_size, + current_scale, + min_scale, + base_length=64): """Resize the images for multi-scale training. Args: @@ -82,6 +89,8 @@ def _resize_align_multi_scale(image, input_size, current_scale, min_scale): input_size (np.ndarray[2]): Size (w, h) of the image input current_scale (float): Current scale min_scale (float): Minimal scale + base_length (int): The width and height should be multiples of + base_length. Default: 64. Returns: tuple: A tuple containing image info. @@ -92,7 +101,7 @@ def _resize_align_multi_scale(image, input_size, current_scale, min_scale): """ assert len(input_size) == 2 size_resized, center, scale = _get_multi_scale_size( - image, input_size, current_scale, min_scale) + image, input_size, current_scale, min_scale, base_length) trans = get_affine_transform(center, scale, 0, size_resized) image_resized = cv2.warpAffine(image, trans, size_resized) @@ -100,7 +109,11 @@ def _resize_align_multi_scale(image, input_size, current_scale, min_scale): return image_resized, center, scale -def _resize_align_multi_scale_udp(image, input_size, current_scale, min_scale): +def _resize_align_multi_scale_udp(image, + input_size, + current_scale, + min_scale, + base_length=64): """Resize the images for multi-scale training. Args: @@ -108,6 +121,8 @@ def _resize_align_multi_scale_udp(image, input_size, current_scale, min_scale): input_size (np.ndarray[2]): Size (w, h) of the image input current_scale (float): Current scale min_scale (float): Minimal scale + base_length (int): The width and height should be multiples of + base_length. Default: 64. Returns: tuple: A tuple containing image info. 
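The `base_length` argument threaded through the resize helpers above controls the granularity of the network input: the reference size at the minimum test scale is rounded up to a multiple of `base_length`, and the other scales are derived from it. A rough sketch of the size computation for a landscape image; `ceil_to_multiple` below is a local stand-in for the internal `_ceil_to_multiples_of` helper, and the numbers are illustrative:

```python
import numpy as np

def ceil_to_multiple(x, base=64):
    # stand-in for mmpose's internal _ceil_to_multiples_of helper
    return int(np.ceil(x / base)) * base

input_size, current_scale, min_scale, base_length = 512, 1.0, 1.0, 64
h, w = 427, 640  # example landscape image (h < w)

# the shorter side (height) follows input_size ...
min_input_h = ceil_to_multiple(min_scale * input_size, base_length)
h_resized = int(min_input_h * current_scale / min_scale)
# ... while the longer side keeps the aspect ratio and is rounded up
# to a multiple of base_length
w_resized = int(
    ceil_to_multiple(min_input_h / h * w, base_length) * current_scale /
    min_scale)

print(w_resized, h_resized)  # 768 512
```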
@@ -118,10 +133,11 @@ def _resize_align_multi_scale_udp(image, input_size, current_scale, min_scale): """ assert len(input_size) == 2 size_resized, _, _ = _get_multi_scale_size(image, input_size, - current_scale, min_scale, True) + current_scale, min_scale, + base_length, True) _, center, scale = _get_multi_scale_size(image, input_size, min_scale, - min_scale, True) + min_scale, base_length, True) trans = get_warp_matrix( theta=0, @@ -211,6 +227,77 @@ def __call__(self, joints): return hms +class OffsetGenerator: + """Generate offset maps for bottom-up models. + + Args: + num_joints (int): Number of keypoints + output_size (np.ndarray): Size (w, h) of feature map + radius (int): Radius of area assigned with valid offset + """ + + def __init__(self, output_size, num_joints, radius=4): + if not isinstance(output_size, np.ndarray): + output_size = np.array(output_size) + if output_size.size > 1: + assert len(output_size) == 2 + self.output_size = output_size + else: + self.output_size = np.array([output_size, output_size], + dtype=np.int) + self.num_joints = num_joints + assert radius > 0, f'`radius` must be a positive value, ' \ + f'but got {radius}' + self.radius = radius + + def __call__(self, center, joints, area): + """Generate offset maps.""" + + offset_map = np.zeros( + (self.num_joints * 2, self.output_size[1], self.output_size[0]), + dtype=np.float32) + weight_map = np.zeros( + (self.num_joints * 2, self.output_size[1], self.output_size[0]), + dtype=np.float32) + area_map = np.zeros((self.output_size[1], self.output_size[0]), + dtype=np.float32) + + for i in range(len(center)): + x_center, y_center = center[i, 0, 0], center[i, 0, 1] + if center[i, 0, 2] < 1 or x_center < 0 or y_center < 0 \ + or x_center >= self.output_size[0] \ + or y_center >= self.output_size[1]: + continue + + for j in range(self.num_joints): + x, y = joints[i, j, :2] + if joints[i, j, 2] < 1 or x >= self.output_size[0] \ + or y >= self.output_size[1] or x < 0 or y < 0: + continue + + start_x = max(int(x_center - self.radius), 0) + start_y = max(int(y_center - self.radius), 0) + end_x = min(int(x_center + self.radius), self.output_size[0]) + end_y = min(int(y_center + self.radius), self.output_size[1]) + + for pos_x in range(start_x, end_x): + for pos_y in range(start_y, end_y): + offset_x = pos_x - x + offset_y = pos_y - y + if offset_map[j*2, pos_y, pos_x] != 0 \ + or offset_map[j*2+1, pos_y, pos_x] != 0: + if area_map[pos_y, pos_x] < area[i]: + continue + offset_map[j * 2, pos_y, pos_x] = offset_x + offset_map[j * 2 + 1, pos_y, pos_x] = offset_y + weight_map[j * 2, pos_y, pos_x] = 1. / np.sqrt(area[i]) + weight_map[j * 2 + 1, pos_y, + pos_x] = 1. / np.sqrt(area[i]) + area_map[pos_y, pos_x] = area[i] + + return offset_map, weight_map + + class JointsEncoder: """Encodes the visible joints into (coordinates, score); The coordinate of one joint and its score are of `int` type. @@ -359,6 +446,55 @@ def __call__(self, joints): return pafs +@PIPELINES.register_module() +class GetKeypointCenterArea: + """Copmute center and area from keypoitns for each instance. + + Required key: 'joints'. + + Modifies key: 'center' and 'area'. + + Args: + minimal_area (float): Minimum of allowed area. Instance with + smaller area will be ignored in training. Default: 32. 
+ """ + + def __init__(self, minimal_area=32): + self.minimal_area = minimal_area + + def __call__(self, results): + """Copmute center and area from keypoitns for each instance.""" + + center_list = [] + area_list = [] + + for joints in results['joints']: + + area = np.zeros((joints.shape[0]), dtype=np.float32) + center = np.zeros((joints.shape[0], 1, 3), dtype=np.float32) + for i in range(joints.shape[0]): + visible_joints = joints[i][joints[i][..., 2] > 0][..., :2] + if visible_joints.size == 0: + continue + + center[i, 0, :2] = visible_joints.mean(axis=0, keepdims=True) + center[i, 0, 2] = 1 + + area[i] = np.power( + visible_joints.max(axis=0) - visible_joints.min(axis=0), + 2)[:2].sum() + if area[i] < self.minimal_area: + center[i, 0, 2] = 0 + + center_list.append(center) + area_list.append(area) + + results['center'] = center_list + results['area'] = area_list + + return results + + @PIPELINES.register_module() class BottomUpRandomFlip: """Data augmentation with random image flip for bottom-up. @@ -584,38 +720,138 @@ def __call__(self, results): class BottomUpGenerateHeatmapTarget: """Generate multi-scale heatmap target for bottom-up. + Required key: 'joints', 'mask' and 'center'. + + Modifies key: 'target', 'heatmaps' and 'masks'. + Args: - sigma (int): Sigma of heatmap Gaussian - max_num_people (int): Maximum number of people in an image + sigma (int or tuple): Sigma of heatmap Gaussian. If sigma is a tuple, + the first item should be the sigma of keypoints and the second + item should be the sigma of center. + bg_weight (float): Weight for background. Default: 1.0. + gen_center_heatmap (bool): Whether to generate heatmaps for instance + centers. Default: False. use_udp (bool): To use unbiased data processing. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). """ - def __init__(self, sigma, use_udp=False): + def __init__(self, + sigma, + bg_weight=1.0, + gen_center_heatmap=False, + use_udp=False): + + if isinstance(sigma, int): + sigma = (sigma, ) + if gen_center_heatmap: + assert len(sigma) == 2, 'sigma for centers must be given if ' \ + '`gen_center_heatmap` is True. ' \ + 'e.g. 
sigma=(2, 4)' + self.sigma = sigma + self.bg_weight = bg_weight + self.gen_center_heatmap = gen_center_heatmap self.use_udp = use_udp - def _generate(self, num_joints, heatmap_size): + def _generate(self, num_joints, sigma, heatmap_size): """Get heatmap generator.""" heatmap_generator = [ - HeatmapGenerator(output_size, num_joints, self.sigma, self.use_udp) + HeatmapGenerator(output_size, num_joints, sigma, self.use_udp) for output_size in heatmap_size ] return heatmap_generator def __call__(self, results): """Generate multi-scale heatmap target for bottom-up.""" + target_list = list() + joints_list = results['joints'] + mask_list = results['mask'] + output_mask_list = [] + heatmap_generator = \ self._generate(results['ann_info']['num_joints'], + self.sigma[0], results['ann_info']['heatmap_size']) - target_list = list() - joints_list = results['joints'] for scale_id in range(results['ann_info']['num_scales']): heatmaps = heatmap_generator[scale_id](joints_list[scale_id]) target_list.append(heatmaps.astype(np.float32)) + + if self.bg_weight != 1: + mask = mask_list[scale_id].copy().astype(np.float32) + mask = mask[None, ...].repeat(heatmaps.shape[0], axis=0) + mask = mask * self.bg_weight + mask[np.logical_and(heatmaps > 0, mask > 0)] = 1 + output_mask_list.append(mask) + + if self.gen_center_heatmap: + center_list = results['center'] + heatmap_generator = self._generate( + 1, self.sigma[1], results['ann_info']['heatmap_size']) + + for scale_id in range(results['ann_info']['num_scales']): + heatmaps = heatmap_generator[scale_id]( + center_list[scale_id]).astype(np.float32) + target_list[scale_id] = np.concatenate( + (heatmaps, target_list[scale_id]), axis=0) + + if self.bg_weight != 1: + mask = mask_list[scale_id].copy().astype(np.float32) + mask = mask[None, ...] * self.bg_weight + mask[np.logical_and(heatmaps > 0, mask > 0)] = 1 + output_mask_list[scale_id] = np.concatenate( + (mask, output_mask_list[scale_id]), axis=0) + results['target'] = target_list + results['heatmaps'] = target_list + results['masks'] = output_mask_list + + return results + + +@PIPELINES.register_module() +class BottomUpGenerateOffsetTarget: + """Generate multi-scale offset target for bottom-up. + + Required key: 'center', 'joints and 'area'. + + Modifies key: 'offsets', 'offset_weights. + + Args: + radius (int): Radius of labeled area for each instance. 
+ """ + + def __init__(self, radius=4): + self.radius = radius + + def _generate(self, num_joints, heatmap_size): + """Get offset generator.""" + offset_generator = [ + OffsetGenerator(output_size, num_joints, self.radius) + for output_size in heatmap_size + ] + return offset_generator + + def __call__(self, results): + """Generate multi-scale offset target for bottom-up.""" + target_list = list() + weight_list = list() + center_list = results['center'] + joints_list = results['joints'] + area_list = results['area'] + + offset_generator = self._generate(results['ann_info']['num_joints'], + results['ann_info']['heatmap_size']) + + for scale_id in range(results['ann_info']['num_scales']): + offset, offset_weight = offset_generator[scale_id]( + center_list[scale_id], joints_list[scale_id], + area_list[scale_id]) + target_list.append(offset.astype(np.float32)) + weight_list.append(offset_weight) + results['offsets'] = target_list + results['offset_weights'] = weight_list return results @@ -894,15 +1130,22 @@ class BottomUpGetImgSize: Args: test_scale_factor (List[float]): Multi scale current_scale (int): default 1 + base_length (int): The width and height should be multiples of + base_length. Default: 64. use_udp (bool): To use unbiased data processing. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). """ - def __init__(self, test_scale_factor, current_scale=1, use_udp=False): + def __init__(self, + test_scale_factor, + current_scale=1, + base_length=64, + use_udp=False): self.test_scale_factor = test_scale_factor self.min_scale = min(test_scale_factor) self.current_scale = current_scale + self.base_length = base_length self.use_udp = use_udp def __call__(self, results): @@ -916,41 +1159,13 @@ def __call__(self, results): input_size = np.array([input_size, input_size], dtype=np.int) img = results['img'] - h, w, _ = img.shape - - # calculate the size for min_scale - min_input_w = _ceil_to_multiples_of(self.min_scale * input_size[0], 64) - min_input_h = _ceil_to_multiples_of(self.min_scale * input_size[1], 64) - if w < h: - w_resized = int(min_input_w * self.current_scale / self.min_scale) - h_resized = int( - _ceil_to_multiples_of(min_input_w / w * h, 64) * - self.current_scale / self.min_scale) - if self.use_udp: - scale_w = w - 1.0 - scale_h = (h_resized - 1.0) / (w_resized - 1.0) * (w - 1.0) - else: - scale_w = w / 200.0 - scale_h = h_resized / w_resized * w / 200.0 - else: - h_resized = int(min_input_h * self.current_scale / self.min_scale) - w_resized = int( - _ceil_to_multiples_of(min_input_h / h * w, 64) * - self.current_scale / self.min_scale) - if self.use_udp: - scale_h = h - 1.0 - scale_w = (w_resized - 1.0) / (h_resized - 1.0) * (h - 1.0) - else: - scale_h = h / 200.0 - scale_w = w_resized / h_resized * h / 200.0 - if self.use_udp: - center = (scale_w / 2.0, scale_h / 2.0) - else: - center = np.array([round(w / 2.0), round(h / 2.0)]) + base_size, center, scale = _get_multi_scale_size( + img, input_size, self.current_scale, self.min_scale, + self.base_length, self.use_udp) results['ann_info']['test_scale_factor'] = self.test_scale_factor - results['ann_info']['base_size'] = (w_resized, h_resized) + results['ann_info']['base_size'] = base_size results['ann_info']['center'] = center - results['ann_info']['scale'] = np.array([scale_w, scale_h]) + results['ann_info']['scale'] = scale return results @@ -961,13 +1176,16 @@ class BottomUpResizeAlign: Args: transforms (List): ToTensor & Normalize + base_length 
(int): The width and height should be multiples of + base_length. Default: 64. use_udp (bool): To use unbiased data processing. Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). """ - def __init__(self, transforms, use_udp=False): + def __init__(self, transforms, base_length=64, use_udp=False): self.transforms = Compose(transforms) + self.base_length = base_length if use_udp: self._resize_align_multi_scale = _resize_align_multi_scale_udp else: @@ -988,7 +1206,8 @@ def __call__(self, results): for _, s in enumerate(sorted(test_scale_factor, reverse=True)): _results = results.copy() image_resized, _, _ = self._resize_align_multi_scale( - _results['img'], input_size, s, min(test_scale_factor)) + _results['img'], input_size, s, min(test_scale_factor), + self.base_length) _results['img'] = image_resized _results = self.transforms(_results) transformed_img = _results['img'].unsqueeze(0) diff --git a/mmpose/models/detectors/__init__.py b/mmpose/models/detectors/__init__.py index 6c7728d9fa..814be7ba26 100644 --- a/mmpose/models/detectors/__init__.py +++ b/mmpose/models/detectors/__init__.py @@ -7,6 +7,7 @@ from .multi_task import MultiTask from .multiview_pose import (DetectAndRegress, VoxelCenterDetector, VoxelSinglePose) +from .one_stage import DisentangledKeypointRegressor from .pose_lifter import PoseLifter from .posewarper import PoseWarper from .top_down import TopDown @@ -14,5 +15,6 @@ __all__ = [ 'TopDown', 'AssociativeEmbedding', 'CID', 'ParametricMesh', 'MultiTask', 'PoseLifter', 'Interhand3D', 'PoseWarper', 'DetectAndRegress', - 'VoxelCenterDetector', 'VoxelSinglePose', 'GestureRecognizer' + 'VoxelCenterDetector', 'VoxelSinglePose', 'GestureRecognizer', + 'DisentangledKeypointRegressor' ] diff --git a/mmpose/models/detectors/one_stage.py b/mmpose/models/detectors/one_stage.py new file mode 100644 index 0000000000..c4464fa282 --- /dev/null +++ b/mmpose/models/detectors/one_stage.py @@ -0,0 +1,449 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import numpy as np +import torch +from mmcv.image import imwrite +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.image import imshow + +from mmpose.core.evaluation import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, get_group_preds) +from mmpose.core.post_processing import nearby_joints_nms +from mmpose.core.post_processing.group import HeatmapOffsetParser +from mmpose.core.visualization import imshow_keypoints +from .. import builder +from ..builder import POSENETS +from ..utils import DekrRescoreNet +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class DisentangledKeypointRegressor(BasePose): + """Disentangled keypoint regression pose detector. + + Args: + backbone (dict): Backbone modules to extract feature. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. 
+ """ + + def __init__(self, + backbone, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super().__init__() + self.fp16_enabled = False + + self.backbone = builder.build_backbone(backbone) + + if keypoint_head is not None: + self.keypoint_head = builder.build_head(keypoint_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.use_udp = test_cfg.get('use_udp', False) + self.parser = HeatmapOffsetParser(self.test_cfg) + self.pretrained = pretrained + + rescore_cfg = test_cfg.get('rescore_cfg', None) + if rescore_cfg is not None: + self.rescore_net = DekrRescoreNet(**rescore_cfg) + + self.init_weights() + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + if pretrained is not None: + self.pretrained = pretrained + self.backbone.init_weights(self.pretrained) + if self.with_keypoint: + self.keypoint_head.init_weights() + if hasattr(self, 'rescore_net'): + self.rescore_net.init_weight() + + @auto_fp16(apply_to=('img', )) + def forward(self, + img=None, + heatmaps=None, + masks=None, + offsets=None, + offset_weights=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss is True. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C + - img_width: imgW + - img_height: imgH + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + + Args: + img (torch.Tensor[N,C,imgH,imgW]): # input image. + targets (list(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. + masks (list(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints (list(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + img_metas (dict): Information about val & test. + By default it includes: + + - "image_file": image path + - "aug_data": # input + - "test_scale_factor": test scale factor + - "base_size": base size of # input + - "center": center of image + - "scale": scale of image + - "flip_index": flip index of keypoints + return loss (bool): ``return_loss=True`` for training, + ``return_loss=False`` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if 'return_loss' is true, then return losses. \ + Otherwise, return predicted poses, scores, image \ + paths and heatmaps. + """ + + if return_loss: + return self.forward_train(img, heatmaps, masks, offsets, + offset_weights, img_metas, **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, heatmaps, masks, offsets, offset_weights, + img_metas, **kwargs): + """Forward the bottom-up model and calculate the loss. + + Note: + batch_size: N + num_keypoints: K + num_img_channel: C + img_width: imgW + img_height: imgH + heatmaps weight: W + heatmaps height: H + max_num_people: M + + Args: + img (torch.Tensor[N,C,imgH,imgW]): # input image. + targets (List(torch.Tensor[N,K,H,W])): Multi-scale target heatmaps. 
+ masks (List(torch.Tensor[N,H,W])): Masks of multi-scale target + heatmaps + joints (List(torch.Tensor[N,M,K,2])): Joints of multi-scale target + heatmaps for ae loss + img_metas (dict):Information about val&test + By default this includes: + - "image_file": image path + - "aug_data": # input + - "test_scale_factor": test scale factor + - "base_size": base size of # input + - "center": center of image + - "scale": scale of image + - "flip_index": flip index of keypoints + + Returns: + dict: The total loss for bottom-up + """ + + output = self.backbone(img) + + if self.with_keypoint: + output = self.keypoint_head(output) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, + heatmaps, + masks, + offsets, + offset_weights, + ) + losses.update(keypoint_losses) + + return losses + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): # input image. + + Returns: + Tensor: Outputs. + """ + output = self.backbone(img) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Inference the one-stage model. + + Note: + - Batchsize: N (currently support batchsize = 1) + - num_img_channel: C + - img_width: imgW + - img_height: imgH + + Args: + flip_index (List(int)): + aug_data (List(Tensor[NxCximgHximgW])): Multi-scale image + num_joints (int): Number of joints of an instsance.\ + test_scale_factor (List(float)): Multi-scale factor + base_size (Tuple(int)): Base size of image when scale is 1 + image_size (int): Short edge of images when scale is 1 + heatmap_size (int): Short edge of outputs when scale is 1 + center (np.ndarray): center of image + scale (np.ndarray): the scale of image + skeleton (List(List(int))): Links of joints + """ + assert img.size(0) == 1 + assert len(img_metas) == 1 + + img_metas = img_metas[0] + + flip_index = img_metas['flip_index'] + aug_data = img_metas['aug_data'] + num_joints = img_metas['num_joints'] + test_scale_factor = img_metas['test_scale_factor'] + base_size = img_metas['base_size'] + image_size = img_metas['image_size'] + heatmap_size = img_metas['heatmap_size'][0] + center = img_metas['center'] + scale = img_metas['scale'] + skeleton = img_metas['skeleton'] + + result = {} + + scale_heatmaps_list = [] + scale_poses_dict = dict() + + for idx, s in enumerate(sorted(test_scale_factor, reverse=True)): + image_resized = aug_data[idx].to(img.device) + + features = self.backbone(image_resized) + if self.with_keypoint: + outputs = self.keypoint_head(features) + heatmaps, offsets = outputs[0] + + if self.test_cfg.get('flip_test', True): + # use flip test + image_flipped = torch.flip(image_resized, [3]) + features_flipped = self.backbone(image_flipped) + if self.with_keypoint: + outputs_flipped = self.keypoint_head(features_flipped) + heatmaps_flipped, offsets_flipped = outputs_flipped[0] + + # compute heatmaps for flipped input image + center_heatmaps_flipped = flip_feature_maps( + [heatmaps_flipped[:, :1]], None)[0] + keypoint_heatmaps_flipped = flip_feature_maps( + [heatmaps_flipped[:, 1:]], flip_index=flip_index)[0] + heatmaps_flipped = torch.cat( + [center_heatmaps_flipped, keypoint_heatmaps_flipped], + dim=1) + + # compute offsets for flipped input image + h, w = offsets_flipped.shape[2], offsets_flipped.shape[3] + offsets_flipped = offsets_flipped.view(num_joints, 2, h, w) + offsets_flipped = 
offsets_flipped.transpose(1, 0).contiguous() + offsets_flipped[0] = -offsets_flipped[0] - 1 + offsets_flipped = flip_feature_maps([offsets_flipped], + flip_index=flip_index)[0] + offsets_flipped = offsets_flipped.transpose(1, 0).reshape( + 1, -1, h, w) + + heatmaps_flipped = [heatmaps_flipped] + offsets_flipped = [offsets_flipped] + + else: + heatmaps_flipped = None + offsets_flipped = None + + # aggregate heatmaps and offsets + aggregated_heatmaps = aggregate_stage_flip( + [heatmaps], + heatmaps_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='average', + aggregate_flip='average')[0] + scale_heatmaps_list.append(aggregated_heatmaps) + + aggregated_offsets = aggregate_stage_flip( + [offsets], + offsets_flipped, + index=-1, + project2image=self.test_cfg['project2image'], + size_projected=base_size, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_stage='average', + aggregate_flip='average')[0] + + poses = self.parser.decode(aggregated_heatmaps, aggregated_offsets) + # rescale pose coordinates to a unified scale + poses[..., :2] *= (image_size * 1.0 / heatmap_size) / s + scale_poses_dict[s] = poses + + # aggregate multi-scale heatmaps + aggregated_heatmaps = aggregate_scale( + scale_heatmaps_list, + align_corners=self.test_cfg.get('align_corners', True), + aggregate_scale='average', + size_projected=base_size) + + # rescale the score of instances inferred from difference scales + max_score_ref = 1 + if len(scale_poses_dict.get(1, [])) > 0: + max_score_ref = scale_poses_dict[1][..., 2].max() + + for s, poses in scale_poses_dict.items(): + if s != 1.0 and poses.shape[0]: + rescale_factor = max_score_ref / poses[..., 2].max() + poses[..., 2] *= rescale_factor * self.test_cfg.get( + 'multi_scale_score_decrease', 1.0) + + poses = torch.cat(tuple(scale_poses_dict.values())) + # refine keypoint scores using keypoint heatmaps + poses = self.parser.refine_score(aggregated_heatmaps, poses) + poses = poses.cpu().numpy() + + # nms + if poses.shape[0] and self.test_cfg.get('use_nms', False): + kpts_db = [] + for i in range(len(poses)): + kpts_db.append( + dict(keypoints=poses[i, :, :2], score=poses[i, :, 3])) + + keep_pose_inds = nearby_joints_nms( + kpts_db, + self.test_cfg['nms_dist_thr'], + self.test_cfg['nms_joints_thr'], + score_per_joint=True, + max_dets=self.test_cfg['max_num_people']) + poses = poses[keep_pose_inds] + scores = poses[..., 2].mean(axis=1) + + # recover the pose to match the size of original image + preds = get_group_preds( + poses[None], center, scale, base_size, use_udp=self.use_udp) + + image_paths = [] + image_paths.append(img_metas['image_file']) + + if return_heatmap: + output_heatmap = aggregated_heatmaps.detach().cpu().numpy() + else: + output_heatmap = None + + # rescore each instance with a pretrained rescore net + if hasattr(self, 'rescore_net') and len(preds) > 0: + re_scores = self.rescore_net(np.stack(preds, axis=0), skeleton) + re_scores = re_scores.cpu().numpy() + re_scores[np.isnan(re_scores)] = 0 + scores *= re_scores + + result['preds'] = preds + result['scores'] = scores + result['image_paths'] = image_paths + result['output_heatmap'] = output_heatmap + + return result + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='AssociativeEmbedding') + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color=None, + pose_kpt_color=None, + pose_link_color=None, + 
radius=4, + thickness=1, + font_scale=0.5, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + skeleton is 0-based indexing. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. + If None, do not draw links. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized image only if not `show` or `out_file` + """ + img = mmcv.imread(img) + img = img.copy() + img_h, img_w, _ = img.shape + + pose_result = [] + for res in result: + pose_result.append(res['keypoints']) + + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_link_color, radius, thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py index 9bbc767fc4..41b8aa190a 100644 --- a/mmpose/models/heads/__init__.py +++ b/mmpose/models/heads/__init__.py @@ -5,6 +5,7 @@ from .cid_head import CIDHead from .deconv_head import DeconvHead from .deeppose_regression_head import DeepposeRegressionHead +from .dekr_head import DEKRHead from .hmr_head import HMRMeshHead from .interhand_3d_head import Interhand3DHead from .mtut_head import MultiModalSSAHead @@ -22,5 +23,5 @@ 'AEHigherResolutionHead', 'AESimpleHead', 'AEMultiStageHead', 'CIDHead', 'DeepposeRegressionHead', 'TemporalRegressionHead', 'Interhand3DHead', 'HMRMeshHead', 'DeconvHead', 'ViPNASHeatmapSimpleHead', 'CuboidCenterHead', - 'CuboidPoseHead', 'MultiModalSSAHead' + 'CuboidPoseHead', 'MultiModalSSAHead', 'DEKRHead' ] diff --git a/mmpose/models/heads/dekr_head.py b/mmpose/models/heads/dekr_head.py new file mode 100644 index 0000000000..c3d1f8efb0 --- /dev/null +++ b/mmpose/models/heads/dekr_head.py @@ -0,0 +1,245 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +from mmcv.cnn import (ConvModule, build_activation_layer, build_conv_layer, + build_norm_layer, constant_init, normal_init) + +from mmpose.models.builder import build_loss +from ..backbones.resnet import BasicBlock +from ..builder import HEADS +from .deconv_head import DeconvHead + +try: + from mmcv.ops import DeformConv2d + has_mmcv_full = True +except (ImportError, ModuleNotFoundError): + has_mmcv_full = False + + +class AdaptiveActivationBlock(nn.Module): + """Adaptive activation convolution block. "Bottom-up human pose estimation + via disentangled keypoint regression", CVPR'2021. + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + groups (int): Number of groups. Generally equal to the + number of joints. + norm_cfg (dict): Config for normalization layers. 
+ act_cfg (dict): Config for activation layers. + """ + + def __init__(self, + in_channels, + out_channels, + groups=1, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU')): + + super(AdaptiveActivationBlock, self).__init__() + + assert in_channels % groups == 0 and out_channels % groups == 0 + self.groups = groups + + regular_matrix = torch.tensor([[-1, -1, -1, 0, 0, 0, 1, 1, 1], + [-1, 0, 1, -1, 0, 1, -1, 0, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1]]) + self.register_buffer('regular_matrix', regular_matrix.float()) + + self.transform_matrix_conv = build_conv_layer( + dict(type='Conv2d'), + in_channels=in_channels, + out_channels=6 * groups, + kernel_size=3, + padding=1, + groups=groups, + bias=True) + + if has_mmcv_full: + self.adapt_conv = DeformConv2d( + in_channels, + out_channels, + kernel_size=3, + padding=1, + bias=False, + groups=groups, + deform_groups=groups) + else: + raise ImportError('Please install the full version of mmcv ' + 'to use `DeformConv2d`.') + + self.norm = build_norm_layer(norm_cfg, out_channels)[1] + self.act = build_activation_layer(act_cfg) + + def forward(self, x): + B, _, H, W = x.size() + residual = x + + affine_matrix = self.transform_matrix_conv(x) + affine_matrix = affine_matrix.permute(0, 2, 3, 1).contiguous() + affine_matrix = affine_matrix.view(B, H, W, self.groups, 2, 3) + offset = torch.matmul(affine_matrix, self.regular_matrix) + offset = offset.transpose(4, 5).reshape(B, H, W, self.groups * 18) + offset = offset.permute(0, 3, 1, 2).contiguous() + + x = self.adapt_conv(x, offset) + x = self.norm(x) + x = self.act(x + residual) + + return x + + +@HEADS.register_module() +class DEKRHead(DeconvHead): + """DisEntangled Keypoint Regression head. "Bottom-up human pose estimation + via disentangled keypoint regression", CVPR'2021. + + Args: + in_channels (int): Number of input channels. + num_joints (int): Number of joints. + num_heatmap_filters (int): Number of filters for heatmap branch. + num_offset_filters_per_joint (int): Number of filters for each joint. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resized to the + same size as the first one and then concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + heatmap_loss (dict): Config for heatmap loss. Default: None. + offset_loss (dict): Config for offset loss. Default: None. 
+ """ + + def __init__(self, + in_channels, + num_joints, + num_heatmap_filters=32, + num_offset_filters_per_joint=15, + in_index=0, + input_transform=None, + num_deconv_layers=0, + num_deconv_filters=None, + num_deconv_kernels=None, + extra=dict(final_conv_kernel=0), + align_corners=False, + heatmap_loss=None, + offset_loss=None): + + super().__init__( + in_channels, + out_channels=in_channels, + num_deconv_layers=num_deconv_layers, + num_deconv_filters=num_deconv_filters, + num_deconv_kernels=num_deconv_kernels, + align_corners=align_corners, + in_index=in_index, + input_transform=input_transform, + extra=extra, + loss_keypoint=heatmap_loss) + + # set up filters for heatmap + self.heatmap_conv_layers = nn.Sequential( + ConvModule( + in_channels=self.in_channels, + out_channels=num_heatmap_filters, + kernel_size=1, + norm_cfg=dict(type='BN')), + BasicBlock(num_heatmap_filters, num_heatmap_filters), + build_conv_layer( + dict(type='Conv2d'), + in_channels=num_heatmap_filters, + out_channels=1 + num_joints, + kernel_size=1)) + + # set up filters for offset map + groups = num_joints + num_offset_filters = num_joints * num_offset_filters_per_joint + + self.offset_conv_layers = nn.Sequential( + ConvModule( + in_channels=self.in_channels, + out_channels=num_offset_filters, + kernel_size=1, + norm_cfg=dict(type='BN')), + AdaptiveActivationBlock( + num_offset_filters, num_offset_filters, groups=groups), + AdaptiveActivationBlock( + num_offset_filters, num_offset_filters, groups=groups), + build_conv_layer( + dict(type='Conv2d'), + in_channels=num_offset_filters, + out_channels=2 * num_joints, + kernel_size=1, + groups=groups)) + + # set up offset losses + self.offset_loss = build_loss(copy.deepcopy(offset_loss)) + + def get_loss(self, outputs, heatmaps, masks, offsets, offset_weights): + """Calculate the dekr loss. + + Note: + - batch_size: N + - num_channels: C + - num_joints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + outputs (List(torch.Tensor[N,C,H,W])): Multi-scale outputs. + heatmaps (List(torch.Tensor[N,K+1,H,W])): Multi-scale heatmap + targets. + masks (List(torch.Tensor[N,K+1,H,W])): Weights of multi-scale + heatmap targets. + offsets (List(torch.Tensor[N,K*2,H,W])): Multi-scale offset + targets. + offset_weights (List(torch.Tensor[N,K*2,H,W])): Weights of + multi-scale offset targets. 
+ """ + + losses = dict() + + for idx in range(len(outputs)): + pred_heatmap, pred_offset = outputs[idx] + heatmap_weight = masks[idx].view(masks[idx].size(0), + masks[idx].size(1), -1) + losses['loss_hms'] = losses.get('loss_hms', 0) + self.loss( + pred_heatmap, heatmaps[idx], heatmap_weight) + losses['loss_ofs'] = losses.get('loss_ofs', 0) + self.offset_loss( + pred_offset, offsets[idx], offset_weights[idx]) + + return losses + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + x = self.deconv_layers(x) + x = self.final_layer(x) + heatmap = self.heatmap_conv_layers(x) + offset = self.offset_conv_layers(x) + return [[heatmap, offset]] + + def init_weights(self): + """Initialize model weights.""" + super().init_weights() + for name, m in self.heatmap_conv_layers.named_modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for name, m in self.offset_conv_layers.named_modules(): + if isinstance(m, nn.Conv2d): + if 'transform_matrix_conv' in name: + normal_init(m, std=1e-8, bias=0) + else: + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) diff --git a/mmpose/models/losses/__init__.py b/mmpose/models/losses/__init__.py index 1ae13c1db0..e4288330d3 100644 --- a/mmpose/models/losses/__init__.py +++ b/mmpose/models/losses/__init__.py @@ -5,27 +5,13 @@ from .mse_loss import JointsMSELoss, JointsOHKMMSELoss from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss, RLELoss, - SemiSupervisionLoss, SmoothL1Loss, SoftWingLoss, - WingLoss) + SemiSupervisionLoss, SmoothL1Loss, + SoftWeightSmoothL1Loss, SoftWingLoss, WingLoss) __all__ = [ - 'JointsMSELoss', - 'JointsOHKMMSELoss', - 'HeatmapLoss', - 'AELoss', - 'MultiLossFactory', - 'MeshLoss', - 'GANLoss', - 'SmoothL1Loss', - 'WingLoss', - 'MPJPELoss', - 'MSELoss', - 'L1Loss', - 'BCELoss', - 'BoneLoss', - 'SemiSupervisionLoss', - 'SoftWingLoss', - 'AdaptiveWingLoss', - 'RLELoss', - 'FocalHeatmapLoss', + 'JointsMSELoss', 'JointsOHKMMSELoss', 'HeatmapLoss', 'AELoss', + 'MultiLossFactory', 'MeshLoss', 'GANLoss', 'SmoothL1Loss', 'WingLoss', + 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss', + 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss', + 'SoftWeightSmoothL1Loss', 'FocalHeatmapLoss' ] diff --git a/mmpose/models/losses/mse_loss.py b/mmpose/models/losses/mse_loss.py index f972efadfd..8710feaba1 100644 --- a/mmpose/models/losses/mse_loss.py +++ b/mmpose/models/losses/mse_loss.py @@ -17,8 +17,9 @@ class JointsMSELoss(nn.Module): def __init__(self, use_target_weight=False, loss_weight=1.): super().__init__() - self.criterion = nn.MSELoss() self.use_target_weight = use_target_weight + reduction = 'none' if use_target_weight else 'mean' + self.criterion = nn.MSELoss(reduction=reduction) self.loss_weight = loss_weight def forward(self, output, target, target_weight): @@ -36,8 +37,9 @@ def forward(self, output, target, target_weight): heatmap_pred = heatmaps_pred[idx].squeeze(1) heatmap_gt = heatmaps_gt[idx].squeeze(1) if self.use_target_weight: - loss += self.criterion(heatmap_pred * target_weight[:, idx], - heatmap_gt * target_weight[:, idx]) + loss_joint = self.criterion(heatmap_pred, heatmap_gt) + loss_joint = loss_joint * target_weight[:, idx] + loss += loss_joint.mean() else: loss += self.criterion(heatmap_pred, heatmap_gt) diff --git a/mmpose/models/losses/regression_loss.py 
b/mmpose/models/losses/regression_loss.py index fc7aa33847..326b211b09 100644 --- a/mmpose/models/losses/regression_loss.py +++ b/mmpose/models/losses/regression_loss.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import math +from functools import partial import torch import torch.nn as nn @@ -528,3 +529,79 @@ def forward(self, output, target): losses['bone_loss'] = loss_bone return losses + + +@LOSSES.register_module() +class SoftWeightSmoothL1Loss(nn.Module): + """Smooth L1 loss with soft weight for regression. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + supervise_empty (bool): Whether to supervise the output with zero + weight. + beta (float): Specifies the threshold at which to change between + L1 and L2 loss. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + use_target_weight=False, + supervise_empty=True, + beta=1.0, + loss_weight=1.): + super().__init__() + + reduction = 'none' if use_target_weight else 'mean' + self.criterion = partial( + self.smooth_l1_loss, reduction=reduction, beta=beta) + + self.supervise_empty = supervise_empty + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + @staticmethod + def smooth_l1_loss(input, target, reduction='none', beta=1.0): + """Re-implement torch.nn.functional.smooth_l1_loss with beta to support + pytorch <= 1.6.""" + delta = input - target + mask = delta.abs() < beta + delta[mask] = (delta[mask]).pow(2) / (2 * beta) + delta[~mask] = delta[~mask].abs() - beta / 2 + + if reduction == 'mean': + return delta.mean() + elif reduction == 'sum': + return delta.sum() + elif reduction == 'none': + return delta + else: + raise ValueError(f'reduction must be \'mean\', \'sum\' or ' + f'\'none\', but got \'{reduction}\'') + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K, D]): + Weights across different joint types. 
+ """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output, target) * target_weight + if self.supervise_empty: + loss = loss.mean() + else: + num_elements = torch.nonzero(target_weight > 0).size()[0] + loss = loss.sum() / max(num_elements, 1.0) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight diff --git a/mmpose/models/utils/__init__.py b/mmpose/models/utils/__init__.py index d2ed972655..3206f5cc80 100644 --- a/mmpose/models/utils/__init__.py +++ b/mmpose/models/utils/__init__.py @@ -4,6 +4,7 @@ from .misc import torch_meshgrid_ij from .ops import resize from .realnvp import RealNVP +from .rescore import DekrRescoreNet from .smpl import SMPL from .tcformer_utils import (TCFormerDynamicBlock, TCFormerRegularBlock, TokenConv, cluster_dpc_knn, merge_tokens, @@ -15,5 +16,5 @@ 'PatchMerging', 'batch_rodrigues', 'quat_to_rotmat', 'rot6d_to_rotmat', 'resize', 'RealNVP', 'torch_meshgrid_ij', 'token2map', 'TokenConv', 'TCFormerRegularBlock', 'TCFormerDynamicBlock', 'cluster_dpc_knn', - 'merge_tokens', 'token_interp', 'tcformer_convert' + 'merge_tokens', 'token_interp', 'tcformer_convert', 'DekrRescoreNet' ] diff --git a/mmpose/models/utils/rescore.py b/mmpose/models/utils/rescore.py new file mode 100644 index 0000000000..2b70d4f680 --- /dev/null +++ b/mmpose/models/utils/rescore.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Code is modified from `HRNet/DEKR `. + +import torch +from mmcv.runner import load_checkpoint + + +class DekrRescoreNet(torch.nn.Module): + """Rescore net used to predict the OKS score of predicted pose. We use the + off-the-shelf rescore net pretrained by authors of DEKR. + + Args: + in_channels (int): input channels + norm_indexes (Tuple(int)): indexes of torso in skeleton. + pretrained (str): url or path of pretrained rescore net. + """ + + def __init__( + self, + in_channels, + norm_indexes, + pretrained=None, + ): + super(DekrRescoreNet, self).__init__() + + self.pretrained = pretrained + self.norm_indexes = norm_indexes + + hidden = 256 + + self.l1 = torch.nn.Linear(in_channels, hidden, bias=True) + self.l2 = torch.nn.Linear(hidden, hidden, bias=True) + self.l3 = torch.nn.Linear(hidden, 1, bias=True) + self.relu = torch.nn.ReLU() + + def make_feature(self, poses, skeleton): + """Combine original scores, joint distance and relative distance to + make feature. 
+ + Args: + poses (np.ndarray): predicetd poses + skeleton (list(list(int))): joint links + + Returns: + torch.Tensor: feature for each instance + """ + poses = torch.tensor(poses) + joint_1, joint_2 = zip(*skeleton) + num_link = len(skeleton) + + joint_relate = (poses[:, joint_1] - poses[:, joint_2])[:, :, :2] + joint_length = joint_relate.norm(dim=2) + + # To use the torso distance to normalize + normalize = (joint_length[:, self.norm_indexes[0]] + + joint_length[:, self.norm_indexes[1]]) / 2 + normalize = normalize.unsqueeze(1).expand(normalize.size(0), num_link) + normalize = normalize.clamp(min=1).contiguous() + + joint_length = joint_length / normalize[:, :] + joint_relate = joint_relate / normalize.unsqueeze(-1) + joint_relate = joint_relate.flatten(1) + + feature = torch.cat((joint_relate, joint_length, poses[..., 2]), + dim=1).float() + return feature + + def forward(self, poses, skeleton): + feature = self.make_feature(poses, skeleton).to(self.l1.weight.device) + x = self.relu(self.l1(feature)) + x = self.relu(self.l2(x)) + x = self.l3(x) + return x.squeeze(1) + + def init_weight(self): + if self.pretrained is not None: + load_checkpoint(self, self.pretrained, map_location='cpu') diff --git a/model-index.yml b/model-index.yml index 5539d644d2..27d610f88f 100644 --- a/model-index.yml +++ b/model-index.yml @@ -28,6 +28,8 @@ Import: - configs/body/2d_kpt_sview_rgb_img/deeppose/coco/resnet_rle_coco.yml - configs/body/2d_kpt_sview_rgb_img/deeppose/mpii/resnet_mpii.yml - configs/body/2d_kpt_sview_rgb_img/deeppose/mpii/resnet_rle_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/dekr/coco/hrnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/dekr/crowdpose/hrnet_crowdpose.yml - configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/aic/hrnet_aic.yml - configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/aic/resnet_aic.yml - configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/alexnet_coco.yml diff --git a/tests/test_evaluation/test_bottom_up_eval.py b/tests/test_evaluation/test_bottom_up_eval.py index 0459ae1bd9..c807ab922d 100644 --- a/tests/test_evaluation/test_bottom_up_eval.py +++ b/tests/test_evaluation/test_bottom_up_eval.py @@ -77,6 +77,12 @@ def test_aggregate_scale(): assert isinstance(output, torch.Tensor) assert output.shape == fake_outputs[0].shape + output = aggregate_scale( + fake_outputs, size_projected=(4, 3), aggregate_scale='average') + assert isinstance(output, torch.Tensor) + assert output.shape[:2] == fake_outputs[0].shape[:2] + assert output.shape[2:] == (3, 4) + output = aggregate_scale( fake_outputs, align_corners=False, aggregate_scale='unsqueeze_concat') diff --git a/tests/test_losses/test_regression_losses.py b/tests/test_losses/test_regression_losses.py index 9c4e030f57..9866c9cd61 100644 --- a/tests/test_losses/test_regression_losses.py +++ b/tests/test_losses/test_regression_losses.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
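A note on the `in_channels` expected by `DekrRescoreNet`: `make_feature` above concatenates, per instance, the normalized relative (x, y) of every skeleton link, the normalized link lengths, and the raw keypoint scores. For COCO (17 keypoints, 19 skeleton links) this gives 2 * 19 + 19 + 17 = 74, which matches the `rescore_cfg=dict(in_channels=74, ...)` used in the test config later in this patch. A quick sanity check, not part of the patch:

```python
# Feature length fed to DekrRescoreNet.l1, following make_feature above.
num_keypoints = 17   # COCO
num_links = 19       # COCO skeleton used in the tests
in_channels = 2 * num_links + num_links + num_keypoints
print(in_channels)   # 74
```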
+import pytest import torch from mmpose.models import build_loss @@ -238,3 +239,43 @@ def test_semi_supervision_loss(): losses = loss(fake_pred, fake_label) assert torch.allclose(losses['proj_loss'], torch.tensor(0.)) assert torch.allclose(losses['bone_loss'], torch.tensor(0.)) + + +def test_soft_weight_smooth_l1_loss(): + loss_cfg = dict( + type='SoftWeightSmoothL1Loss', use_target_weight=False, beta=0.5) + loss = build_loss(loss_cfg) + + fake_pred = torch.zeros((1, 3, 2)) + fake_label = torch.zeros((1, 3, 2)) + assert torch.allclose(loss(fake_pred, fake_label), torch.tensor(0.)) + + fake_pred = torch.ones((1, 3, 2)) + fake_label = torch.zeros((1, 3, 2)) + assert torch.allclose(loss(fake_pred, fake_label), torch.tensor(.75)) + + loss_cfg = dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=True) + loss = build_loss(loss_cfg) + + fake_pred = torch.ones((1, 3, 2)) + fake_label = torch.zeros((1, 3, 2)) + fake_weight = torch.arange(6).reshape(1, 3, 2).float() + assert torch.allclose( + loss(fake_pred, fake_label, fake_weight), torch.tensor(1.25)) + + loss_cfg = dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + supervise_empty=False) + loss = build_loss(loss_cfg) + assert torch.allclose( + loss(fake_pred, fake_label, fake_weight), torch.tensor(1.5)) + + with pytest.raises(ValueError): + _ = loss.smooth_l1_loss(fake_pred, fake_label, reduction='fake') + + output = loss.smooth_l1_loss(fake_pred, fake_label, reduction='sum') + assert torch.allclose(output, torch.tensor(3.0)) diff --git a/tests/test_models/test_bottom_up_head.py b/tests/test_models/test_bottom_up_head.py index 4748f31b1e..f74c8c108a 100644 --- a/tests/test_models/test_bottom_up_head.py +++ b/tests/test_models/test_bottom_up_head.py @@ -3,7 +3,7 @@ import pytest import torch -from mmpose.models import AEHigherResolutionHead, AESimpleHead +from mmpose.models import AEHigherResolutionHead, AESimpleHead, DEKRHead def test_ae_simple_head(): @@ -469,6 +469,46 @@ def test_ae_higherresolution_head(): assert out[1].shape == torch.Size([1, 34, 64, 64]) +def test_DEKRHead(): + head = DEKRHead( + in_channels=64, + num_joints=17, + num_heatmap_filters=32, + num_offset_filters_per_joint=15, + in_index=0, + heatmap_loss=dict( + type='JointsMSELoss', + use_target_weight=True, + ), + offset_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + )) + head.init_weights() + input_shape = (1, 64, 128, 128) + inputs = _demo_inputs(input_shape) + + # test forward + output = head([inputs]) + assert len(output) == 1 + assert len(output[0]) == 2 + heatmaps, offsets = output[0] + assert heatmaps.size(1) == 18 + assert heatmaps.size(2) == 128 + assert offsets.size(1) == 34 + assert offsets.size(2) == 128 + + # test get_loss + heatmaps_target = torch.rand(heatmaps.size()) + heatmaps_weight = torch.rand(heatmaps.size()) + offsets_target = torch.rand(offsets.size()) + offsets_weight = torch.rand(offsets.size()) + loss = head.get_loss(output, [heatmaps_target], [heatmaps_weight], + [offsets_target], [offsets_weight]) + assert 'loss_hms' in loss + assert 'loss_ofs' in loss + + def _demo_inputs(input_shape=(1, 3, 64, 64)): """Create a superset of inputs needed to run backbone. diff --git a/tests/test_models/test_one_stage_forward.py b/tests/test_models/test_one_stage_forward.py new file mode 100644 index 0000000000..0540fcecff --- /dev/null +++ b/tests/test_models/test_one_stage_forward.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
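The expected values asserted in `test_soft_weight_smooth_l1_loss` above follow directly from the smooth L1 definition in `SoftWeightSmoothL1Loss.smooth_l1_loss`. A standalone NumPy check of those numbers (a sketch for verification only, not part of the patch):

```python
import numpy as np

def smooth_l1(delta, beta):
    # quadratic below beta, linear above, matching the re-implementation
    d = np.abs(delta)
    return np.where(d < beta, d ** 2 / (2 * beta), d - beta / 2)

# Case 1: pred=1, target=0, beta=0.5, no target weights -> mean loss
print(smooth_l1(np.ones((1, 3, 2)), beta=0.5).mean())   # 0.75

# Case 2: default beta=1.0, weights 0..5, supervise_empty=True -> mean
weights = np.arange(6).reshape(1, 3, 2)
weighted = smooth_l1(np.ones((1, 3, 2)), beta=1.0) * weights
print(weighted.mean())                                   # 1.25

# Case 3: same, but supervise_empty=False -> sum over non-zero weights
print(weighted.sum() / (weights > 0).sum())              # 1.5
```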
+import numpy as np +import pytest +import torch + +from mmpose.models.detectors import DisentangledKeypointRegressor + + +def test_dekr_forward(): + model_cfg = dict( + type='DisentangledKeypointRegressor', + pretrained=None, + backbone=dict( + type='HRNet', + in_channels=3, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256), + multiscale_output=True)), + ), + keypoint_head=dict( + type='DEKRHead', + in_channels=(32, 64, 128, 256), + in_index=(0, 1, 2, 3), + num_joints=17, + input_transform='resize_concat', + heatmap_loss=dict( + type='JointsMSELoss', + use_target_weight=True, + ), + offset_loss=dict( + type='SoftWeightSmoothL1Loss', + use_target_weight=True, + )), + train_cfg=dict(), + test_cfg=dict( + num_joints=17, + max_num_people=30, + project2image=False, + align_corners=False, + nms_kernel=5, + nms_padding=2, + use_nms=True, + nms_dist_thr=0.05, + nms_joints_thr=8, + keypoint_threshold=0.01, + rescore_cfg=dict(in_channels=74, norm_indexes=(5, 6)), + flip_test=True)) + + detector = DisentangledKeypointRegressor(model_cfg['backbone'], + model_cfg['keypoint_head'], + model_cfg['train_cfg'], + model_cfg['test_cfg'], + model_cfg['pretrained']) + + with pytest.raises(TypeError): + detector.init_weights(pretrained=dict()) + detector.pretrained = model_cfg['pretrained'] + detector.init_weights() + + input_shape = (1, 3, 256, 256) + mm_inputs = _demo_mm_inputs(input_shape) + + imgs = mm_inputs.pop('imgs') + heatmaps = mm_inputs.pop('heatmaps') + masks = mm_inputs.pop('masks') + offsets = mm_inputs.pop('offsets') + offset_weights = mm_inputs.pop('offset_weights') + img_metas = mm_inputs.pop('img_metas') + + # Test forward train + losses = detector.forward( + imgs, + heatmaps, + masks, + offsets, + offset_weights, + img_metas, + return_loss=True) + assert isinstance(losses, dict) + + # Test forward test + detector.eval() + with torch.no_grad(): + _ = detector.forward(imgs, img_metas=img_metas, return_loss=False) + _ = detector.forward_dummy(imgs) + + # test rescore net + preds = np.random.rand(2, 17, 3) + _ = detector.rescore_net(preds, img_metas[0]['skeleton']) + + # test without flip_test + detector.test_cfg['flip_test'] = False + _ = detector.forward(imgs, img_metas=img_metas, return_loss=False) + + +def _demo_mm_inputs(input_shape=(1, 3, 256, 256)): + """Create a superset of inputs needed to run test or train batches. 
+ + Args: + input_shape (tuple): + input batch dimensions + """ + (N, C, H, W) = input_shape + + rng = np.random.RandomState(0) + + imgs = rng.rand(*input_shape) + heatmaps = np.zeros([N, 18, H // 4, W // 4], dtype=np.float32) + masks = np.ones([N, 18, H // 4, W // 4], dtype=np.float32) + offsets = np.zeros([N, 34, H // 4, W // 4], dtype=np.float32) + offset_weights = np.ones([N, 34, H // 4, W // 4], dtype=np.float32) + + img_metas = [{ + 'image_file': + 'test.jpg', + 'num_joints': + 17, + 'aug_data': [torch.zeros(1, 3, 256, 256), + torch.zeros(1, 3, 128, 128)], + 'test_scale_factor': [1, 0.5], + 'base_size': (256, 256), + 'image_size': + 256, + 'heatmap_size': [64], + 'center': + np.array([128, 128]), + 'scale': + np.array([1.28, 1.28]), + 'flip_index': + [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15], + 'skeleton': [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], + [6, 12], [5, 6], [5, 7], [6, 8], [7, 9], [8, 10], [1, 2], + [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]] + } for _ in range(N)] + + mm_inputs = { + 'imgs': torch.FloatTensor(imgs).requires_grad_(True), + 'heatmaps': [torch.FloatTensor(heatmaps)], + 'masks': [torch.FloatTensor(masks)], + 'offsets': [torch.FloatTensor(offsets)], + 'offset_weights': [torch.FloatTensor(offset_weights)], + 'img_metas': img_metas + } + return mm_inputs diff --git a/tests/test_pipelines/test_bottom_up_pipelines.py b/tests/test_pipelines/test_bottom_up_pipelines.py index 6d05c633bd..efb151af3c 100644 --- a/tests/test_pipelines/test_bottom_up_pipelines.py +++ b/tests/test_pipelines/test_bottom_up_pipelines.py @@ -2,17 +2,20 @@ import copy import os.path as osp +import cv2 import numpy as np import pytest import xtcocotools from xtcocotools.coco import COCO -from mmpose.datasets.pipelines import (BottomUpGenerateHeatmapTarget, +from mmpose.datasets.pipelines import BottomUpGenerateHeatmapTarget # noqa +from mmpose.datasets.pipelines import (BottomUpGenerateOffsetTarget, BottomUpGeneratePAFTarget, BottomUpGenerateTarget, BottomUpGetImgSize, BottomUpRandomAffine, BottomUpRandomFlip, BottomUpResizeAlign, + GetKeypointCenterArea, LoadImageFromFile) @@ -34,7 +37,7 @@ def _get_mask(coco, anno, img_id): for rle in rles: m += xtcocotools.mask.decode(rle) - return m < 0.5 + return (m < 0.5).astype(np.float32) def _get_joints(anno, ann_info, int_sigma): @@ -155,7 +158,7 @@ def test_bottomup_pipeline(): results_horizontal_flip = random_horizontal_flip( copy.deepcopy(results_copy)) - # test TopDownAffine + # test BottomUpRandomAffine random_affine_transform = BottomUpRandomAffine(30, [0.75, 1.5], 'short', 0) results_affine_transform = random_affine_transform(copy.deepcopy(results)) assert results_affine_transform['img'].shape == (512, 384, 3) @@ -372,21 +375,121 @@ def test_BottomUpGenerateHeatmapTarget(): ] joints = _get_joints(anno, ann_info, False) - mask_list = [mask.copy() for _ in range(ann_info['num_scales'])] + mask_list = [ + cv2.resize(mask.copy(), (size, size)) + for size in ann_info['heatmap_size'] + ] joints_list = [joints.copy() for _ in range(ann_info['num_scales'])] + center_list = [ + joints.mean(axis=1, keepdims=True) for joints in joints_list + ] results = {} results['dataset'] = 'coco' results['image_file'] = osp.join(data_prefix, '000000000785.jpg') results['mask'] = mask_list results['joints'] = joints_list + results['center'] = center_list results['ann_info'] = ann_info - generate_heatmap_target = BottomUpGenerateHeatmapTarget(2) + generate_heatmap_target = BottomUpGenerateHeatmapTarget((2, 4), 0.1, True) 
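The positional arguments in the constructor call above map to `sigma=(2, 4)` (keypoint sigma 2, instance-center sigma 4), `bg_weight=0.1` and `gen_center_heatmap=True`. In a training config the new transforms would typically be chained as in the hypothetical fragment below; the values are illustrative, see the DEKR configs added by this patch for the actual settings:

```python
# Hypothetical training-pipeline fragment (not copied from the configs).
train_pipeline_tail = [
    # derives results['center'] and results['area'] from visible keypoints
    dict(type='GetKeypointCenterArea', minimal_area=32),
    # sigma=(2, 4): keypoint sigma 2, center sigma 4; gen_center_heatmap
    # prepends a center channel, giving 1 + K heatmap channels
    dict(
        type='BottomUpGenerateHeatmapTarget',
        sigma=(2, 4),
        gen_center_heatmap=True,
        bg_weight=0.1),
    # writes results['offsets'] / results['offset_weights'] (2K channels)
    dict(type='BottomUpGenerateOffsetTarget', radius=4),
]
```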
results_generate_heatmap_target = generate_heatmap_target(results) assert 'target' in results_generate_heatmap_target + assert 'heatmaps' in results_generate_heatmap_target + assert 'masks' in results_generate_heatmap_target assert len(results_generate_heatmap_target['target'] ) == results['ann_info']['num_scales'] + assert len(results_generate_heatmap_target['heatmaps'] + ) == results['ann_info']['num_scales'] + assert len(results_generate_heatmap_target['masks'] + ) == results['ann_info']['num_scales'] + + +def test_GetKeypointCenterArea(): + data_prefix = 'tests/data/coco/' + ann_file = osp.join(data_prefix, 'test_coco.json') + coco = COCO(ann_file) + + ann_info = {} + ann_info['num_joints'] = 17 + ann_info['num_scales'] = 2 + ann_info['scale_aware_sigma'] = False + + ann_ids = coco.getAnnIds(785) + anno = coco.loadAnns(ann_ids) + + anno = [ + obj for obj in anno if obj['iscrowd'] == 0 or obj['num_keypoints'] > 0 + ] + joints = _get_joints(anno, ann_info, False) + + joints_list = [joints.copy() for _ in range(ann_info['num_scales'])] + + results = {} + results['dataset'] = 'coco' + results['image_file'] = osp.join(data_prefix, '000000000785.jpg') + results['joints'] = joints_list + results['ann_info'] = ann_info + + get_kpt_center_area = GetKeypointCenterArea(minimal_area=32) + results_get_kpt_center_area = get_kpt_center_area(results) + assert 'center' in results_get_kpt_center_area + assert 'area' in results_get_kpt_center_area + assert len(results_get_kpt_center_area['center'] + ) == results['ann_info']['num_scales'] + assert len(results_get_kpt_center_area['center'][0]) == 1 + assert len(results_get_kpt_center_area['area'] + ) == results['ann_info']['num_scales'] + assert len(results_get_kpt_center_area['area'][0]) == 1 + + for joints in results['joints']: + joints[..., 2] = 0 + results_get_kpt_center_area = get_kpt_center_area(results) + assert len(results_get_kpt_center_area['center']) > 0 + assert results_get_kpt_center_area['center'][0][..., 2] == 0 + + +def test_BottomUpGenerateOffsetTarget(): + data_prefix = 'tests/data/coco/' + ann_file = osp.join(data_prefix, 'test_coco.json') + coco = COCO(ann_file) + + ann_info = {} + ann_info['heatmap_size'] = [[512, 512], 256] + ann_info['num_joints'] = 17 + ann_info['num_scales'] = 2 + ann_info['scale_aware_sigma'] = False + + ann_ids = coco.getAnnIds(785) + anno = coco.loadAnns(ann_ids) + + anno = [ + obj for obj in anno if obj['iscrowd'] == 0 or obj['num_keypoints'] > 0 + ] + joints = _get_joints(anno, ann_info, False) + + joints_list = [joints.copy() for _ in range(ann_info['num_scales'])] + + results = {} + results['dataset'] = 'coco' + results['image_file'] = osp.join(data_prefix, '000000000785.jpg') + results['joints'] = joints_list + results['ann_info'] = ann_info + + get_kpt_center_area = GetKeypointCenterArea(minimal_area=32) + results = get_kpt_center_area(results) + generate_offset_target = BottomUpGenerateOffsetTarget(radius=4) + results_generate_offset_target = generate_offset_target(results) + assert 'offsets' in results_generate_offset_target + assert 'offset_weights' in results_generate_offset_target + assert len(results_generate_offset_target['offsets'] + ) == results['ann_info']['num_scales'] + assert len(results_generate_offset_target['offsets'] + [1]) == results['ann_info']['num_joints'] * 2 + assert len(results_generate_offset_target['offset_weights'] + ) == results['ann_info']['num_scales'] + assert len(results_generate_offset_target['offset_weights'] + [1]) == results['ann_info']['num_joints'] * 2 def 
test_BottomUpGeneratePAFTarget(): diff --git a/tests/test_post_processing/test_nms.py b/tests/test_post_processing/test_nms.py index 13d793d239..8da2011614 100644 --- a/tests/test_post_processing/test_nms.py +++ b/tests/test_post_processing/test_nms.py @@ -1,7 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np +import pytest -from mmpose.core.post_processing.nms import nms, oks_iou, oks_nms, soft_oks_nms +from mmpose.core.post_processing.nms import (nearby_joints_nms, nms, oks_iou, + oks_nms, soft_oks_nms) def test_soft_oks_nms(): @@ -79,3 +81,33 @@ def test_oks_iou(): assert result[0] == 1. result = oks_iou(np.zeros([17 * 3]), np.ones([1, 17 * 3]), 1, [1]) assert result[0] < 0.01 + + +def test_nearby_joints_nms(): + + kpts_db = [] + keep_pose_inds = nearby_joints_nms( + kpts_db, 0.05, score_per_joint=True, max_dets=1) + assert len(keep_pose_inds) == 0 + + kpts_db = [] + for _ in range(5): + kpts_db.append( + dict(keypoints=np.random.rand(3, 2), score=np.random.rand(3))) + keep_pose_inds = nearby_joints_nms( + kpts_db, 0.05, score_per_joint=True, max_dets=1) + assert len(keep_pose_inds) == 1 + assert keep_pose_inds[0] < 5 + + kpts_db = [] + for _ in range(5): + kpts_db.append( + dict(keypoints=np.random.rand(3, 2), score=np.random.rand())) + keep_pose_inds = nearby_joints_nms(kpts_db, 0.05, num_nearby_joints_thr=2) + assert len(keep_pose_inds) <= 5 and len(keep_pose_inds) > 0 + + with pytest.raises(AssertionError): + _ = nearby_joints_nms(kpts_db, 0, num_nearby_joints_thr=2) + + with pytest.raises(AssertionError): + _ = nearby_joints_nms(kpts_db, 0.05, num_nearby_joints_thr=3)
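
Note for reviewers: a minimal usage sketch of the new nearby_joints_nms helper, inferred only from the call patterns in the test above and the nms_dist_thr/nms_joints_thr values in the DEKR test_cfg earlier in this patch. The pose data below are random placeholders, not part of the PR, and the positional distance-threshold argument is passed without assuming its parameter name.

    import numpy as np

    from mmpose.core.post_processing.nms import nearby_joints_nms

    # Each candidate pose is a dict with 'keypoints' of shape (K, 2+) and a
    # 'score' (a scalar, or a per-joint array when score_per_joint=True).
    kpts_db = [
        dict(keypoints=np.random.rand(17, 2), score=np.random.rand())
        for _ in range(10)
    ]

    # Returns indices of the poses to keep. 0.05 and 8 mirror the
    # nms_dist_thr and nms_joints_thr used in the DEKR test_cfg above;
    # max_dets caps the number of kept poses. The distance threshold must be
    # positive and num_nearby_joints_thr must be smaller than the number of
    # joints, otherwise an AssertionError is raised (as exercised in the test).
    keep_pose_inds = nearby_joints_nms(
        kpts_db, 0.05, num_nearby_joints_thr=8, max_dets=30)
    kept_poses = [kpts_db[i] for i in keep_pose_inds]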