In [2]:
!pip install yacs ultralytics



In [3]:
import os
import time
import pandas as pd
import cv2
import numpy as np
import os
import logging
from yacs.config import CfgNode as CN
from easydict import EasyDict as edict
from collections import OrderedDict
import math

import torch
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision
import torch.nn as nn

# Configs

In [33]:
CTX = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEBUG = True
cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.enabled = True

MPII_KEYPOINT_INDEXES = {
  0: "right ankle",
  1: "right knee",
  2: "right hip", 
  3: "left hip", 
  4: "left knee", 
  5: "left ankle",
  6: "pelvis", 
  7: "thorax", 
  8: "upper neck", 
  9: "head top", 
  10: "right wrist",
  11: "right elbow", 
  12: "right shoulder", 
  13: "left shoulder", 
  14: "left elbow",
  15: "left wrist"
}
NUM_KPTS = len(MPII_KEYPOINT_INDEXES)
SKELETON = {
  "left_lower_leg": [0, 1], 
  "left_thigh": [2, 1], 
  "left_hip": [2, 6], 
  "right_lower_leg": [5, 4],
  "right_thigh": [3, 4],
  "right_hip": [3, 6],
  "torso": [6, 7], 
  "neck": [7, 8],             #actually it's thorax - upper neck 
  "head": [8, 9],
  "right_forearm": [10, 11],
  "right_upper_arm": [11, 12], 
  "right_shoulder": [12, 7],
  "left_forearm": [15, 14],
  "left_upper_arm": [14, 13], 
  "left_shoulder": [13, 7]
}
SQUAT_PART = ["left_lower_leg", "left_thigh", "right_lower_leg",
              "right_thigh", "torso"]
SQUAT_KEYPART = [
  ["left_lower_leg", "left_thigh"],
  ["right_lower_leg", "right_thigh"]
]
SQUAT_STAGE_ANGLE = [[180, 170], [170, 155],
               [155, 137], [137, 114], [114, 0]]

JUMPING_JACK_PART = ["left_lower_leg", "left_thigh", 
  "right_lower_leg", "right_thigh",
  "torso", "right_forearm", "right_upper_arm",
  "left_forearm", "left_upper_arm"]

JUMPING_JACK_KEYPART = [
  ["left_upper_arm", "torso"],
  ["right_upper_arm", "torso"]
]

JUMPING_JACK_STAGE_ANGLE = [[0, 10], [10, 35], [35, 64],
                       [64, 104], [104, 180]]

COLOR = {
  "red": [0, 0, 255],
  "blue": [255, 0, 0],
  "green": [0, 255, 0]
}

## Resnet

In [5]:
resnet_config = edict()

resnet_config.OUTPUT_DIR = ''
resnet_config.LOG_DIR = ''
resnet_config.DATA_DIR = ''
resnet_config.GPUS = '0'
resnet_config.WORKERS = 4
resnet_config.PRINT_FREQ = 20


# pose_resnet related params
POSE_RESNET = edict()
POSE_RESNET.NUM_LAYERS = 50
POSE_RESNET.DECONV_WITH_BIAS = False
POSE_RESNET.NUM_DECONV_LAYERS = 3
POSE_RESNET.NUM_DECONV_FILTERS = [256, 256, 256]
POSE_RESNET.NUM_DECONV_KERNELS = [4, 4, 4]
POSE_RESNET.FINAL_CONV_KERNEL = 1
POSE_RESNET.TARGET_TYPE = 'gaussian'
POSE_RESNET.HEATMAP_SIZE = [64, 64]  # width * height, ex: 24 * 32
POSE_RESNET.SIGMA = 2

MODEL_EXTRAS = {
    'pose_resnet': POSE_RESNET,
}

# common params for NETWORK
resnet_config.MODEL = edict()
resnet_config.MODEL.NAME = 'pose_resnet'
resnet_config.MODEL.INIT_WEIGHTS = True
resnet_config.MODEL.PRETRAINED = '/kaggle/input/cva-models/checkpoint.pth.tar'
resnet_config.MODEL.NUM_JOINTS = 16
resnet_config.MODEL.IMAGE_SIZE = [256, 256]  # width * height, ex: 192 * 256
resnet_config.MODEL.EXTRA = MODEL_EXTRAS[resnet_config.MODEL.NAME]

resnet_config.MODEL.STYLE = 'pytorch'

resnet_config.LOSS = edict()
resnet_config.LOSS.USE_TARGET_WEIGHT = True

# DATASET related params
resnet_config.DATASET = edict()
resnet_config.DATASET.FLIP = True
resnet_config.DATASET.SCALE_FACTOR = 0.25
resnet_config.DATASET.ROT_FACTOR = 30

poseres_cfg = edict(resnet_config)

## HRnet

In [6]:
HRnet_cfg = CN()

HRnet_cfg.OUTPUT_DIR = 'output'
HRnet_cfg.LOG_DIR = 'log'
HRnet_cfg.DATA_DIR = ''
HRnet_cfg.GPUS = (0,)
HRnet_cfg.WORKERS = 24
HRnet_cfg.PRINT_FREQ = 100
HRnet_cfg.AUTO_RESUME = True
HRnet_cfg.PIN_MEMORY = True
HRnet_cfg.RANK = 0

# common params for NETWORK
HRnet_cfg.MODEL = CN()
HRnet_cfg.MODEL.NAME = 'pose_hrnet'
HRnet_cfg.MODEL.INIT_WEIGHTS = True
HRnet_cfg.MODEL.PRETRAINED = '/kaggle/input/cva-models/hrpose_w32_256x256.pth'
HRnet_cfg.MODEL.NUM_JOINTS = 16
HRnet_cfg.MODEL.TAG_PER_JOINT = True
HRnet_cfg.MODEL.TARGET_TYPE = 'gaussian'
HRnet_cfg.MODEL.IMAGE_SIZE = [256, 256]  # width * height, ex: 192 * 256
HRnet_cfg.MODEL.HEATMAP_SIZE = [64, 64]  # width * height, ex: 24 * 32
HRnet_cfg.MODEL.SIGMA = 2

HRnet_cfg.MODEL.EXTRA = CN()
HRnet_cfg.MODEL.EXTRA.PRETRAINED_LAYERS = [
  'conv1',
  'bn1',
  'conv2',
  'bn2',
  'layer1',
  'transition1',
  'stage2',
  'transition2',
  'stage3',
  'transition3',
  'stage4'
]
HRnet_cfg.MODEL.EXTRA.STEM_INPLANES = 64
HRnet_cfg.MODEL.EXTRA.FINAL_CONV_KERNEL = 1

HRnet_cfg.MODEL.EXTRA.STAGE2 = CN()
HRnet_cfg.MODEL.EXTRA.STAGE2.NUM_MODULES = 1
HRnet_cfg.MODEL.EXTRA.STAGE2.NUM_BRANCHES = 2
HRnet_cfg.MODEL.EXTRA.STAGE2.NUM_BLOCKS = [4, 4]
HRnet_cfg.MODEL.EXTRA.STAGE2.NUM_CHANNELS = [32, 64]
HRnet_cfg.MODEL.EXTRA.STAGE2.BLOCK = 'BASIC'
HRnet_cfg.MODEL.EXTRA.STAGE2.FUSE_METHOD = 'SUM'

HRnet_cfg.MODEL.EXTRA.STAGE3 = CN()
HRnet_cfg.MODEL.EXTRA.STAGE3.NUM_MODULES = 4
HRnet_cfg.MODEL.EXTRA.STAGE3.NUM_BRANCHES = 3
HRnet_cfg.MODEL.EXTRA.STAGE3.NUM_BLOCKS = [4, 4, 4]
HRnet_cfg.MODEL.EXTRA.STAGE3.NUM_CHANNELS = [32, 64, 128]
HRnet_cfg.MODEL.EXTRA.STAGE3.BLOCK = 'BASIC'
HRnet_cfg.MODEL.EXTRA.STAGE3.FUSE_METHOD = 'SUM'

HRnet_cfg.MODEL.EXTRA.STAGE4 = CN()
HRnet_cfg.MODEL.EXTRA.STAGE4.NUM_MODULES = 3
HRnet_cfg.MODEL.EXTRA.STAGE4.NUM_BRANCHES = 4
HRnet_cfg.MODEL.EXTRA.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
HRnet_cfg.MODEL.EXTRA.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
HRnet_cfg.MODEL.EXTRA.STAGE4.BLOCK = 'BASIC'
HRnet_cfg.MODEL.EXTRA.STAGE4.FUSE_METHOD = 'SUM'

HRnet_cfg.LOSS = CN()
HRnet_cfg.LOSS.USE_TARGET_WEIGHT = True

# DATASET related params
HRnet_cfg.DATASET = CN()
HRnet_cfg.DATASET.ROOT = '/kaggle/input/mpii-2014'
HRnet_cfg.DATASET.DATASET = 'mpii'
HRnet_cfg.DATASET.TRAIN_SET = 'train'
HRnet_cfg.DATASET.TEST_SET = 'valid'
HRnet_cfg.DATASET.DATA_FORMAT = 'jpg'
HRnet_cfg.DATASET.HYBRID_JOINTS_TYPE = ''
HRnet_cfg.DATASET.SELECT_DATA = False

# training data augmentation
HRnet_cfg.DATASET.COLOR_RGB = True
HRnet_cfg.DATASET.FLIP = True
HRnet_cfg.DATASET.NUM_JOINTS_HALF_BODY = 8
HRnet_cfg.DATASET.PROB_HALF_BODY = -1.0
HRnet_cfg.DATASET.ROT_FACTOR = 30
HRnet_cfg.DATASET.SCALE_FACTOR = 0.25

# train
HRnet_cfg.TRAIN = CN()

HRnet_cfg.TRAIN.LR_FACTOR = 0.1
HRnet_cfg.TRAIN.LR_STEP = [170, 200]
HRnet_cfg.TRAIN.LR = 0.001

HRnet_cfg.TRAIN.OPTIMIZER = 'adam'
HRnet_cfg.TRAIN.MOMENTUM = 0.9
HRnet_cfg.TRAIN.WD = 0.0001
HRnet_cfg.TRAIN.NESTEROV = False
HRnet_cfg.TRAIN.GAMMA1 = 0.99
HRnet_cfg.TRAIN.GAMMA2 = 0.0

HRnet_cfg.TRAIN.BEGIN_EPOCH = 0
HRnet_cfg.TRAIN.END_EPOCH = 100

HRnet_cfg.TRAIN.RESUME = False
HRnet_cfg.TRAIN.CHECKPOINT = ''

HRnet_cfg.TRAIN.BATCH_SIZE_PER_GPU = 32
HRnet_cfg.TRAIN.SHUFFLE = True

# testing
HRnet_cfg.TEST = CN()
HRnet_cfg.TEST.MODEL_FILE = "/kaggle/input/cva-models/hrpose_w32_256x256.pth"
HRnet_cfg.TEST.BATCH_SIZE_PER_GPU = 32
HRnet_cfg.TEST.FLIP_TEST = True
HRnet_cfg.TEST.POST_PROCESS = True
HRnet_cfg.TEST.SHIFT_HEATMAP = True
HRnet_cfg.TEST.USE_GT_BBOX = False


# debug
HRnet_cfg.DEBUG = CN()
HRnet_cfg.DEBUG.DEBUG = False
HRnet_cfg.DEBUG.SAVE_BATCH_IMAGES_GT = False
HRnet_cfg.DEBUG.SAVE_BATCH_IMAGES_PRED = False
HRnet_cfg.DEBUG.SAVE_HEATMAPS_GT = False
HRnet_cfg.DEBUG.SAVE_HEATMAPS_PRED = False

## DarkPose

In [7]:
DarkPose_cfg = CN()

DarkPose_cfg.OUTPUT_DIR = 'output'
DarkPose_cfg.LOG_DIR = 'log'
DarkPose_cfg.DATA_DIR = ''
DarkPose_cfg.GPUS = (0,)  # Updated
DarkPose_cfg.WORKERS = 24
DarkPose_cfg.PRINT_FREQ = 100
DarkPose_cfg.AUTO_RESUME = True  # Updated
DarkPose_cfg.PIN_MEMORY = True
DarkPose_cfg.RANK = 0

# Cudnn related params
DarkPose_cfg.CUDNN = CN()
DarkPose_cfg.CUDNN.BENCHMARK = True  # Updated
DarkPose_cfg.CUDNN.DETERMINISTIC = False  # Updated
DarkPose_cfg.CUDNN.ENABLED = True  # Updated

# common params for NETWORK
DarkPose_cfg.MODEL = CN()
DarkPose_cfg.MODEL.NAME = 'pose_hrnet'
DarkPose_cfg.MODEL.INIT_WEIGHTS = True
DarkPose_cfg.MODEL.PRETRAINED = '/kaggle/input/cva-models/dark_w32_256×256.pth'  # Updated
DarkPose_cfg.MODEL.NUM_JOINTS = 16
DarkPose_cfg.MODEL.TAG_PER_JOINT = True
DarkPose_cfg.MODEL.TARGET_TYPE = 'gaussian'
DarkPose_cfg.MODEL.IMAGE_SIZE = [256, 256]
DarkPose_cfg.MODEL.HEATMAP_SIZE = [64, 64]
DarkPose_cfg.MODEL.SIGMA = 2

DarkPose_cfg.MODEL.EXTRA = CN()
DarkPose_cfg.MODEL.EXTRA.NUM_FEATURES = 256
DarkPose_cfg.MODEL.EXTRA.NUM_STACKS = 8
DarkPose_cfg.MODEL.EXTRA.NUM_BLOCKS = 1
DarkPose_cfg.MODEL.EXTRA.NUM_CLASSES = 16
DarkPose_cfg.MODEL.EXTRA.PRETRAINED_LAYERS = [
  'conv1',
  'bn1',
  'conv2',
  'bn2',
  'layer1',
  'transition1',
  'stage2',
  'transition2',
  'stage3',
  'transition3',
  'stage4'
]
DarkPose_cfg.MODEL.EXTRA.FINAL_CONV_KERNEL = 1

DarkPose_cfg.MODEL.EXTRA.STAGE2 = CN()
DarkPose_cfg.MODEL.EXTRA.STAGE2.NUM_MODULES = 1
DarkPose_cfg.MODEL.EXTRA.STAGE2.NUM_BRANCHES = 2
DarkPose_cfg.MODEL.EXTRA.STAGE2.NUM_BLOCKS = [4, 4]
DarkPose_cfg.MODEL.EXTRA.STAGE2.NUM_CHANNELS = [32, 64]
DarkPose_cfg.MODEL.EXTRA.STAGE2.BLOCK = 'BASIC'
DarkPose_cfg.MODEL.EXTRA.STAGE2.FUSE_METHOD = 'SUM'

DarkPose_cfg.MODEL.EXTRA.STAGE3 = CN()
DarkPose_cfg.MODEL.EXTRA.STAGE3.NUM_MODULES = 4
DarkPose_cfg.MODEL.EXTRA.STAGE3.NUM_BRANCHES = 3
DarkPose_cfg.MODEL.EXTRA.STAGE3.NUM_BLOCKS = [4, 4, 4]
DarkPose_cfg.MODEL.EXTRA.STAGE3.NUM_CHANNELS = [32, 64, 128]
DarkPose_cfg.MODEL.EXTRA.STAGE3.BLOCK = 'BASIC'
DarkPose_cfg.MODEL.EXTRA.STAGE3.FUSE_METHOD = 'SUM'

DarkPose_cfg.MODEL.EXTRA.STAGE4 = CN()
DarkPose_cfg.MODEL.EXTRA.STAGE4.NUM_MODULES = 3
DarkPose_cfg.MODEL.EXTRA.STAGE4.NUM_BRANCHES = 4
DarkPose_cfg.MODEL.EXTRA.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
DarkPose_cfg.MODEL.EXTRA.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
DarkPose_cfg.MODEL.EXTRA.STAGE4.BLOCK = 'BASIC'
DarkPose_cfg.MODEL.EXTRA.STAGE4.FUSE_METHOD = 'SUM'

DarkPose_cfg.LOSS = CN()
DarkPose_cfg.LOSS.USE_TARGET_WEIGHT = True
DarkPose_cfg.LOSS.USE_DIFFERENT_JOINTS_WEIGHT = False
# DATASET related params
DarkPose_cfg.DATASET = CN()
DarkPose_cfg.DATASET.ROOT = '/kaggle/input/mpii-2014'  # Updated
DarkPose_cfg.DATASET.DATASET = 'mpii'
DarkPose_cfg.DATASET.TRAIN_SET = 'train'
DarkPose_cfg.DATASET.TEST_SET = 'valid'
DarkPose_cfg.DATASET.DATA_FORMAT = 'jpg'
DarkPose_cfg.DATASET.HYBRID_JOINTS_TYPE = ''
DarkPose_cfg.DATASET.SELECT_DATA = False

# training data augmentation
DarkPose_cfg.DATASET.COLOR_RGB = True  # Updated
DarkPose_cfg.DATASET.FLIP = True  # Updated
DarkPose_cfg.DATASET.NUM_JOINTS_HALF_BODY = 8  # Updated
DarkPose_cfg.DATASET.PROB_HALF_BODY = -1.0  # Updated
DarkPose_cfg.DATASET.ROT_FACTOR = 30  # Updated
DarkPose_cfg.DATASET.SCALE_FACTOR = 0.25  # Updated

# train
DarkPose_cfg.TRAIN = CN()

DarkPose_cfg.TRAIN.LR_FACTOR = 0.1
DarkPose_cfg.TRAIN.LR_STEP = [170, 200]
DarkPose_cfg.TRAIN.LR = 0.001

DarkPose_cfg.TRAIN.OPTIMIZER = 'adam'
DarkPose_cfg.TRAIN.MOMENTUM = 0.9
DarkPose_cfg.TRAIN.WD = 0.0001
DarkPose_cfg.TRAIN.NESTEROV = False
DarkPose_cfg.TRAIN.GAMMA1 = 0.99
DarkPose_cfg.TRAIN.GAMMA2 = 0.0

DarkPose_cfg.TRAIN.BEGIN_EPOCH = 0
DarkPose_cfg.TRAIN.END_EPOCH = 10  # Updated

DarkPose_cfg.TRAIN.RESUME = False
DarkPose_cfg.TRAIN.CHECKPOINT = ''

DarkPose_cfg.TRAIN.BATCH_SIZE_PER_GPU = 32
DarkPose_cfg.TRAIN.SHUFFLE = True

# testing
DarkPose_cfg.TEST = CN()
DarkPose_cfg.TEST.MODEL_FILE = '/kaggle/input/cva-models/dark_w32_256×256.pth'  # Updated
DarkPose_cfg.TEST.BATCH_SIZE_PER_GPU = 32
DarkPose_cfg.TEST.FLIP_TEST = True
DarkPose_cfg.TEST.POST_PROCESS = True
DarkPose_cfg.TEST.BLUR_KERNEL = 11  # Updated
DarkPose_cfg.TEST.SHIFT_HEATMAP = True
DarkPose_cfg.TEST.USE_GT_BBOX = False

# debug
DarkPose_cfg.DEBUG = CN()
DarkPose_cfg.DEBUG.DEBUG = False  # Updated
DarkPose_cfg.DEBUG.SAVE_BATCH_IMAGES_GT = False  # Updated
DarkPose_cfg.DEBUG.SAVE_BATCH_IMAGES_PRED = False  # Updated
DarkPose_cfg.DEBUG.SAVE_HEATMAPS_GT = False  # Updated
DarkPose_cfg.DEBUG.SAVE_HEATMAPS_PRED = False

BN_MOMENTUM = 0.1

# Class Definition

## Resnet

In [8]:
BN_MOMENTUM = 0.1

def res_conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class ResBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(ResBasicBlock, self).__init__()
        self.conv1 = res_conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = res_conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResBottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(ResBottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion,
                                  momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResBottleneck_CAFFE(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(ResBottleneck_CAFFE, self).__init__()
        # add stride to conv1x1
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion,
                                  momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class PoseResNet(nn.Module):

    def __init__(self, block, layers, cfg, **kwargs):
        self.inplanes = 64
        extra = cfg.MODEL.EXTRA
        self.deconv_with_bias = extra.DECONV_WITH_BIAS

        super(PoseResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # used for deconv layers
        self.deconv_layers = self._make_deconv_layer(
            extra.NUM_DECONV_LAYERS,
            extra.NUM_DECONV_FILTERS,
            extra.NUM_DECONV_KERNELS,
        )

        self.final_layer = nn.Conv2d(
            in_channels=extra.NUM_DECONV_FILTERS[-1],
            out_channels=cfg.MODEL.NUM_JOINTS,
            kernel_size=extra.FINAL_CONV_KERNEL,
            stride=1,
            padding=1 if extra.FINAL_CONV_KERNEL == 3 else 0
        )

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _get_deconv_cfg(self, deconv_kernel, index):
        if deconv_kernel == 4:
            padding = 1
            output_padding = 0
        elif deconv_kernel == 3:
            padding = 1
            output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0

        return deconv_kernel, padding, output_padding

    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
        assert num_layers == len(num_filters), \
            'ERROR: num_deconv_layers is different len(num_deconv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_deconv_layers is different len(num_deconv_filters)'

        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i], i)

            planes = num_filters[i]
            layers.append(
                nn.ConvTranspose2d(
                    in_channels=self.inplanes,
                    out_channels=planes,
                    kernel_size=kernel,
                    stride=2,
                    padding=padding,
                    output_padding=output_padding,
                    bias=self.deconv_with_bias))
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            self.inplanes = planes

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.deconv_layers(x)
        x = self.final_layer(x)

        return x

    def init_weights(self, pretrained=''):
        if os.path.isfile(pretrained):
            print('Load pretrained model successfully!')
            for name, m in self.deconv_layers.named_modules():
                if isinstance(m, nn.ConvTranspose2d):
                    nn.init.normal_(m.weight, std=0.001)
                    if self.deconv_with_bias:
                        nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.BatchNorm2d):
                    nn.init.constant_(m.weight, 1)
                    nn.init.constant_(m.bias, 0)
            for m in self.final_layer.modules():
                if isinstance(m, nn.Conv2d):
                    # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                    nn.init.normal_(m.weight, std=0.001)
                    nn.init.constant_(m.bias, 0)

            # pretrained_state_dict = torch.load(pretrained)
            # self.load_state_dict(pretrained_state_dict, strict=False)
            checkpoint = torch.load(pretrained)
            if isinstance(checkpoint, OrderedDict):
                state_dict = checkpoint
            elif isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
                state_dict_old = checkpoint['state_dict']
                state_dict = OrderedDict()
                # delete 'module.' because it is saved from DataParallel module
                for key in state_dict_old.keys():
                    if key.startswith('module.'):
                        # state_dict[key[7:]] = state_dict[key]
                        # state_dict.pop(key)
                        state_dict[key[7:]] = state_dict_old[key]
                    else:
                        state_dict[key] = state_dict_old[key]
            else:
                raise RuntimeError(
                    'No state_dict found in checkpoint file {}'.format(pretrained))
            self.load_state_dict(state_dict, strict=False)
        else:
            raise ValueError('imagenet pretrained model does not exist')


resnet_spec = {50: (ResBottleneck, [3, 4, 6, 3])}


def get_res_pose_net(cfg, is_train, **kwargs):
    num_layers = cfg.MODEL.EXTRA.NUM_LAYERS
    style = cfg.MODEL.STYLE

    block_class, layers = resnet_spec[num_layers]

    if style == 'caffe':
        block_class = ResBottleneck_CAFFE

    model = PoseResNet(block_class, layers, cfg, **kwargs)

    if is_train and cfg.MODEL.INIT_WEIGHTS:
        model.init_weights(cfg.MODEL.PRETRAINED)

    return model

## HRnet

In [9]:
logger = logging.getLogger(__name__)

def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion,
                                  momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class HighResolutionModule(nn.Module):
    def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
                 num_channels, fuse_method, multi_scale_output=True):
        super(HighResolutionModule, self).__init__()
        self._check_branches(
            num_branches, blocks, num_blocks, num_inchannels, num_channels)

        self.num_inchannels = num_inchannels
        self.fuse_method = fuse_method
        self.num_branches = num_branches

        self.multi_scale_output = multi_scale_output

        self.branches = self._make_branches(
            num_branches, blocks, num_blocks, num_channels)
        self.fuse_layers = self._make_fuse_layers()
        self.relu = nn.ReLU(True)

    def _check_branches(self, num_branches, blocks, num_blocks,
                        num_inchannels, num_channels):
        if num_branches != len(num_blocks):
            error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
                num_branches, len(num_blocks))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_channels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
                num_branches, len(num_channels))
            logger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_inchannels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
                num_branches, len(num_inchannels))
            logger.error(error_msg)
            raise ValueError(error_msg)

    def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
                         stride=1):
        downsample = None
        if stride != 1 or \
           self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index] * block.expansion,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(
                    num_channels[branch_index] * block.expansion,
                    momentum=BN_MOMENTUM
                ),
            )

        layers = []
        layers.append(
            block(
                self.num_inchannels[branch_index],
                num_channels[branch_index],
                stride,
                downsample
            )
        )
        self.num_inchannels[branch_index] = \
            num_channels[branch_index] * block.expansion
        for i in range(1, num_blocks[branch_index]):
            layers.append(
                block(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index]
                )
            )

        return nn.Sequential(*layers)

    def _make_branches(self, num_branches, block, num_blocks, num_channels):
        branches = []

        for i in range(num_branches):
            branches.append(
                self._make_one_branch(i, block, num_blocks, num_channels)
            )

        return nn.ModuleList(branches)

    def _make_fuse_layers(self):
        if self.num_branches == 1:
            return None

        num_branches = self.num_branches
        num_inchannels = self.num_inchannels
        fuse_layers = []
        for i in range(num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    fuse_layer.append(
                        nn.Sequential(
                            nn.Conv2d(
                                num_inchannels[j],
                                num_inchannels[i],
                                1, 1, 0, bias=False
                            ),
                            nn.BatchNorm2d(num_inchannels[i]),
                            nn.Upsample(scale_factor=2**(j-i), mode='nearest')
                        )
                    )
                elif j == i:
                    fuse_layer.append(None)
                else:
                    conv3x3s = []
                    for k in range(i-j):
                        if k == i - j - 1:
                            num_outchannels_conv3x3 = num_inchannels[i]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3, 2, 1, bias=False
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3)
                                )
                            )
                        else:
                            num_outchannels_conv3x3 = num_inchannels[j]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3, 2, 1, bias=False
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                    nn.ReLU(True)
                                )
                            )
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))

        return nn.ModuleList(fuse_layers)

    def get_num_inchannels(self):
        return self.num_inchannels

    def forward(self, x):
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        for i in range(self.num_branches):
            x[i] = self.branches[i](x[i])

        x_fuse = []

        for i in range(len(self.fuse_layers)):
            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
            for j in range(1, self.num_branches):
                if i == j:
                    y = y + x[j]
                else:
                    y = y + self.fuse_layers[i][j](x[j])
            x_fuse.append(self.relu(y))

        return x_fuse


blocks_dict = {
    'BASIC': BasicBlock,
    'BOTTLENECK': Bottleneck
}


class PoseHighResolutionNet(nn.Module):

    def __init__(self, cfg, **kwargs):
        self.inplanes = 64
        extra = cfg['MODEL']['EXTRA']
        super(PoseHighResolutionNet, self).__init__()

        # stem net
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(Bottleneck, 64, 4)

        self.stage2_cfg = extra['STAGE2']
        num_channels = self.stage2_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage2_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition1 = self._make_transition_layer([256], num_channels)
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels)

        self.stage3_cfg = extra['STAGE3']
        num_channels = self.stage3_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage3_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition2 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)

        self.stage4_cfg = extra['STAGE4']
        num_channels = self.stage4_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage4_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition3 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=False)

        self.final_layer = nn.Conv2d(
            in_channels=pre_stage_channels[0],
            out_channels=cfg['MODEL']['NUM_JOINTS'],
            kernel_size=extra['FINAL_CONV_KERNEL'],
            stride=1,
            padding=1 if extra['FINAL_CONV_KERNEL'] == 3 else 0
        )

        self.pretrained_layers = extra['PRETRAINED_LAYERS']

    def _make_transition_layer(
            self, num_channels_pre_layer, num_channels_cur_layer):
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(
                        nn.Sequential(
                            nn.Conv2d(
                                num_channels_pre_layer[i],
                                num_channels_cur_layer[i],
                                3, 1, 1, bias=False
                            ),
                            nn.BatchNorm2d(num_channels_cur_layer[i]),
                            nn.ReLU(inplace=True)
                        )
                    )
                else:
                    transition_layers.append(None)
            else:
                conv3x3s = []
                for j in range(i+1-num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    outchannels = num_channels_cur_layer[i] \
                        if j == i-num_branches_pre else inchannels
                    conv3x3s.append(
                        nn.Sequential(
                            nn.Conv2d(
                                inchannels, outchannels, 3, 2, 1, bias=False
                            ),
                            nn.BatchNorm2d(outchannels),
                            nn.ReLU(inplace=True)
                        )
                    )
                transition_layers.append(nn.Sequential(*conv3x3s))

        return nn.ModuleList(transition_layers)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes, planes * block.expansion,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _make_stage(self, layer_config, num_inchannels,
                    multi_scale_output=True):
        num_modules = layer_config['NUM_MODULES']
        num_branches = layer_config['NUM_BRANCHES']
        num_blocks = layer_config['NUM_BLOCKS']
        num_channels = layer_config['NUM_CHANNELS']
        block = blocks_dict[layer_config['BLOCK']]
        fuse_method = layer_config['FUSE_METHOD']

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True

            modules.append(
                HighResolutionModule(
                    num_branches,
                    block,
                    num_blocks,
                    num_inchannels,
                    num_channels,
                    fuse_method,
                    reset_multi_scale_output
                )
            )
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.layer1(x)

        x_list = []
        for i in range(self.stage2_cfg['NUM_BRANCHES']):
            if self.transition1[i] is not None:
                x_list.append(self.transition1[i](x))
            else:
                x_list.append(x)
        y_list = self.stage2(x_list)

        x_list = []
        for i in range(self.stage3_cfg['NUM_BRANCHES']):
            if self.transition2[i] is not None:
                x_list.append(self.transition2[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage3(x_list)

        x_list = []
        for i in range(self.stage4_cfg['NUM_BRANCHES']):
            if self.transition3[i] is not None:
                x_list.append(self.transition3[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage4(x_list)

        x = self.final_layer(y_list[0])

        return x

    def init_weights(self, pretrained=''):
        logger.info('=> init weights from normal distribution')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ['bias']:
                        nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ['bias']:
                        nn.init.constant_(m.bias, 0)

        if os.path.isfile(pretrained):
            pretrained_state_dict = torch.load(pretrained)
            logger.info('=> loading pretrained model {}'.format(pretrained))

            need_init_state_dict = {}
            for name, m in pretrained_state_dict.items():
                if name.split('.')[0] in self.pretrained_layers \
                   or self.pretrained_layers[0] is '*':
                    need_init_state_dict[name] = m
            self.load_state_dict(need_init_state_dict, strict=False)
        elif pretrained:
            logger.error('=> please download pre-trained models first!')
            raise ValueError('{} is not exist!'.format(pretrained))


def get_pose_net(cfg, is_train, **kwargs):
    model = PoseHighResolutionNet(cfg, **kwargs)

    if is_train and cfg['MODEL']['INIT_WEIGHTS']:
        model.init_weights(cfg['MODEL']['PRETRAINED'])

    return model

  or self.pretrained_layers[0] is '*':


## DarkPose

In [10]:
darklogger = logging.getLogger(__name__)

def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class DarkBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(DarkBasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class DarkBottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(DarkBottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion,
                                  momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class DarkHighResolutionModule(nn.Module):
    def __init__(self, num_branches, blocks, num_blocks, num_inchannels,
                 num_channels, fuse_method, multi_scale_output=True):
        super(DarkHighResolutionModule, self).__init__()
        self._check_branches(
            num_branches, blocks, num_blocks, num_inchannels, num_channels)

        self.num_inchannels = num_inchannels
        self.fuse_method = fuse_method
        self.num_branches = num_branches

        self.multi_scale_output = multi_scale_output

        self.branches = self._make_branches(
            num_branches, blocks, num_blocks, num_channels)
        self.fuse_layers = self._make_fuse_layers()
        self.relu = nn.ReLU(True)

    def _check_branches(self, num_branches, blocks, num_blocks,
                        num_inchannels, num_channels):
        if num_branches != len(num_blocks):
            error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(
                num_branches, len(num_blocks))
            darklogger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_channels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(
                num_branches, len(num_channels))
            darklogger.error(error_msg)
            raise ValueError(error_msg)

        if num_branches != len(num_inchannels):
            error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format(
                num_branches, len(num_inchannels))
            darklogger.error(error_msg)
            raise ValueError(error_msg)

    def _make_one_branch(self, branch_index, block, num_blocks, num_channels,
                         stride=1):
        downsample = None
        if stride != 1 or \
           self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index] * block.expansion,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(
                    num_channels[branch_index] * block.expansion,
                    momentum=BN_MOMENTUM
                ),
            )

        layers = []
        layers.append(
            block(
                self.num_inchannels[branch_index],
                num_channels[branch_index],
                stride,
                downsample
            )
        )
        self.num_inchannels[branch_index] = \
            num_channels[branch_index] * block.expansion
        for i in range(1, num_blocks[branch_index]):
            layers.append(
                block(
                    self.num_inchannels[branch_index],
                    num_channels[branch_index]
                )
            )

        return nn.Sequential(*layers)

    def _make_branches(self, num_branches, block, num_blocks, num_channels):
        branches = []

        for i in range(num_branches):
            branches.append(
                self._make_one_branch(i, block, num_blocks, num_channels)
            )

        return nn.ModuleList(branches)

    def _make_fuse_layers(self):
        if self.num_branches == 1:
            return None

        num_branches = self.num_branches
        num_inchannels = self.num_inchannels
        fuse_layers = []
        for i in range(num_branches if self.multi_scale_output else 1):
            fuse_layer = []
            for j in range(num_branches):
                if j > i:
                    fuse_layer.append(
                        nn.Sequential(
                            nn.Conv2d(
                                num_inchannels[j],
                                num_inchannels[i],
                                1, 1, 0, bias=False
                            ),
                            nn.BatchNorm2d(num_inchannels[i]),
                            nn.Upsample(scale_factor=2**(j-i), mode='nearest')
                        )
                    )
                elif j == i:
                    fuse_layer.append(None)
                else:
                    conv3x3s = []
                    for k in range(i-j):
                        if k == i - j - 1:
                            num_outchannels_conv3x3 = num_inchannels[i]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3, 2, 1, bias=False
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3)
                                )
                            )
                        else:
                            num_outchannels_conv3x3 = num_inchannels[j]
                            conv3x3s.append(
                                nn.Sequential(
                                    nn.Conv2d(
                                        num_inchannels[j],
                                        num_outchannels_conv3x3,
                                        3, 2, 1, bias=False
                                    ),
                                    nn.BatchNorm2d(num_outchannels_conv3x3),
                                    nn.ReLU(True)
                                )
                            )
                    fuse_layer.append(nn.Sequential(*conv3x3s))
            fuse_layers.append(nn.ModuleList(fuse_layer))

        return nn.ModuleList(fuse_layers)

    def get_num_inchannels(self):
        return self.num_inchannels

    def forward(self, x):
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        for i in range(self.num_branches):
            x[i] = self.branches[i](x[i])

        x_fuse = []

        for i in range(len(self.fuse_layers)):
            y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
            for j in range(1, self.num_branches):
                if i == j:
                    y = y + x[j]
                else:
                    y = y + self.fuse_layers[i][j](x[j])
            x_fuse.append(self.relu(y))

        return x_fuse


blocks_dict = {
    'BASIC': DarkBasicBlock,
    'BOTTLENECK': DarkBottleneck
}


class DarkPoseHighResolutionNet(nn.Module):

    def __init__(self, cfg, **kwargs):
        self.inplanes = 64
        extra = cfg.MODEL.EXTRA
        super(DarkPoseHighResolutionNet, self).__init__()

        # stem net
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(DarkBottleneck, 64, 4)

        self.stage2_cfg = cfg['MODEL']['EXTRA']['STAGE2']
        num_channels = self.stage2_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage2_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition1 = self._make_transition_layer([256], num_channels)
        self.stage2, pre_stage_channels = self._make_stage(
            self.stage2_cfg, num_channels)

        self.stage3_cfg = cfg['MODEL']['EXTRA']['STAGE3']
        num_channels = self.stage3_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage3_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition2 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage3, pre_stage_channels = self._make_stage(
            self.stage3_cfg, num_channels)

        self.stage4_cfg = cfg['MODEL']['EXTRA']['STAGE4']
        num_channels = self.stage4_cfg['NUM_CHANNELS']
        block = blocks_dict[self.stage4_cfg['BLOCK']]
        num_channels = [
            num_channels[i] * block.expansion for i in range(len(num_channels))
        ]
        self.transition3 = self._make_transition_layer(
            pre_stage_channels, num_channels)
        self.stage4, pre_stage_channels = self._make_stage(
            self.stage4_cfg, num_channels, multi_scale_output=False)

        self.final_layer = nn.Conv2d(
            in_channels=pre_stage_channels[0],
            out_channels=cfg.MODEL.NUM_JOINTS,
            kernel_size=extra.FINAL_CONV_KERNEL,
            stride=1,
            padding=1 if extra.FINAL_CONV_KERNEL == 3 else 0
        )

        self.pretrained_layers = cfg['MODEL']['EXTRA']['PRETRAINED_LAYERS']

    def _make_transition_layer(
            self, num_channels_pre_layer, num_channels_cur_layer):
        num_branches_cur = len(num_channels_cur_layer)
        num_branches_pre = len(num_channels_pre_layer)

        transition_layers = []
        for i in range(num_branches_cur):
            if i < num_branches_pre:
                if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
                    transition_layers.append(
                        nn.Sequential(
                            nn.Conv2d(
                                num_channels_pre_layer[i],
                                num_channels_cur_layer[i],
                                3, 1, 1, bias=False
                            ),
                            nn.BatchNorm2d(num_channels_cur_layer[i]),
                            nn.ReLU(inplace=True)
                        )
                    )
                else:
                    transition_layers.append(None)
            else:
                conv3x3s = []
                for j in range(i+1-num_branches_pre):
                    inchannels = num_channels_pre_layer[-1]
                    outchannels = num_channels_cur_layer[i] \
                        if j == i-num_branches_pre else inchannels
                    conv3x3s.append(
                        nn.Sequential(
                            nn.Conv2d(
                                inchannels, outchannels, 3, 2, 1, bias=False
                            ),
                            nn.BatchNorm2d(outchannels),
                            nn.ReLU(inplace=True)
                        )
                    )
                transition_layers.append(nn.Sequential(*conv3x3s))

        return nn.ModuleList(transition_layers)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes, planes * block.expansion,
                    kernel_size=1, stride=stride, bias=False
                ),
                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _make_stage(self, layer_config, num_inchannels,
                    multi_scale_output=True):
        num_modules = layer_config['NUM_MODULES']
        num_branches = layer_config['NUM_BRANCHES']
        num_blocks = layer_config['NUM_BLOCKS']
        num_channels = layer_config['NUM_CHANNELS']
        block = blocks_dict[layer_config['BLOCK']]
        fuse_method = layer_config['FUSE_METHOD']

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True

            modules.append(
                DarkHighResolutionModule(
                    num_branches,
                    block,
                    num_blocks,
                    num_inchannels,
                    num_channels,
                    fuse_method,
                    reset_multi_scale_output
                )
            )
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.layer1(x)

        x_list = []
        for i in range(self.stage2_cfg['NUM_BRANCHES']):
            if self.transition1[i] is not None:
                x_list.append(self.transition1[i](x))
            else:
                x_list.append(x)
        y_list = self.stage2(x_list)

        x_list = []
        for i in range(self.stage3_cfg['NUM_BRANCHES']):
            if self.transition2[i] is not None:
                x_list.append(self.transition2[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage3(x_list)

        x_list = []
        for i in range(self.stage4_cfg['NUM_BRANCHES']):
            if self.transition3[i] is not None:
                x_list.append(self.transition3[i](y_list[-1]))
            else:
                x_list.append(y_list[i])
        y_list = self.stage4(x_list)

        x = self.final_layer(y_list[0])

        return x

    def init_weights(self, pretrained=''):
        darklogger.info('=> init weights from normal distribution')
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ['bias']:
                        nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose2d):
                nn.init.normal_(m.weight, std=0.001)
                for name, _ in m.named_parameters():
                    if name in ['bias']:
                        nn.init.constant_(m.bias, 0)

        if os.path.isfile(pretrained):
            pretrained_state_dict = torch.load(pretrained)
            darklogger.info('=> loading pretrained model {}'.format(pretrained))

            need_init_state_dict = {}
            for name, m in pretrained_state_dict.items():
                if name.split('.')[0] in self.pretrained_layers \
                   or self.pretrained_layers[0] is '*':
                    need_init_state_dict[name] = m
            self.load_state_dict(need_init_state_dict, strict=False)
        elif pretrained:
            darklogger.error('=> please download pre-trained models first!')
            raise ValueError('{} is not exist!'.format(pretrained))


def get_dark_pose_net(cfg, is_train, **kwargs):
    model = DarkPoseHighResolutionNet(cfg, **kwargs)

    if is_train and cfg.MODEL.INIT_WEIGHTS:
        model.init_weights(cfg.MODEL.PRETRAINED)

    return model

  or self.pretrained_layers[0] is '*':


# Function definition

In [11]:
def dark_transform_preds(coords, center, scale, output_size):
    target_coords = np.zeros(coords.shape)
    trans = dark_get_affine_transform(center, scale, 0, output_size, inv=1)
    for p in range(coords.shape[0]):
        target_coords[p, 0:2] = dark_affine_transform(coords[p, 0:2], trans)
    return target_coords


def dark_get_affine_transform(
        center, scale, rot, output_size,
        shift=np.array([0, 0], dtype=np.float32), inv=0
):
    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
        print(scale)
        scale = np.array([scale, scale])

    scale_tmp = scale * 200.0
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180

    src_dir = dark_get_dir([0, (src_w-1) * -0.5], rot_rad)
    dst_dir = np.array([0, (dst_w-1) * -0.5], np.float32)
    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [(dst_w-1) * 0.5, (dst_h-1) * 0.5]
    dst[1, :] = np.array([(dst_w-1) * 0.5, (dst_h-1) * 0.5]) + dst_dir

    src[2:, :] = dark_get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = dark_get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans


def dark_affine_transform(pt, t):
    new_pt = np.array([pt[0], pt[1], 1.]).T
    new_pt = np.dot(t, new_pt)
    return new_pt[:2]


def dark_get_3rd_point(a, b):
    direct = a - b
    return b + np.array([-direct[1], direct[0]], dtype=np.float32)


def dark_get_dir(src_point, rot_rad):
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)

    src_result = [0, 0]
    src_result[0] = src_point[0] * cs - src_point[1] * sn
    src_result[1] = src_point[0] * sn + src_point[1] * cs

    return src_result

def dark_get_max_preds(batch_heatmaps):
    '''
    get predictions from score maps
    heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
    '''
    assert isinstance(batch_heatmaps, np.ndarray), \
        'batch_heatmaps should be numpy.ndarray'
    assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim'

    batch_size = batch_heatmaps.shape[0]
    num_joints = batch_heatmaps.shape[1]
    width = batch_heatmaps.shape[3]
    heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))
    idx = np.argmax(heatmaps_reshaped, 2)
    maxvals = np.amax(heatmaps_reshaped, 2)

    maxvals = maxvals.reshape((batch_size, num_joints, 1))
    idx = idx.reshape((batch_size, num_joints, 1))

    preds = np.tile(idx, (1, 1, 2)).astype(np.float32)

    preds[:, :, 0] = (preds[:, :, 0]) % width
    preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)

    pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
    pred_mask = pred_mask.astype(np.float32)

    preds *= pred_mask
    return preds, maxvals


def dark_taylor(hm, coord):
    heatmap_height = hm.shape[0]
    heatmap_width = hm.shape[1]
    px = int(coord[0])
    py = int(coord[1])
    if 1 < px < heatmap_width-2 and 1 < py < heatmap_height-2:
        dx  = 0.5 * (hm[py][px+1] - hm[py][px-1])
        dy  = 0.5 * (hm[py+1][px] - hm[py-1][px])
        dxx = 0.25 * (hm[py][px+2] - 2 * hm[py][px] + hm[py][px-2])
        dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \
            + hm[py-1][px-1])
        dyy = 0.25 * (hm[py+2*1][px] - 2 * hm[py][px] + hm[py-2*1][px])
        derivative = np.matrix([[dx],[dy]])
        hessian = np.matrix([[dxx,dxy],[dxy,dyy]])
        if dxx * dyy - dxy ** 2 != 0:
            hessianinv = hessian.I
            offset = -hessianinv * derivative
            offset = np.squeeze(np.array(offset.T), axis=0)
            coord += offset
    return coord


def dark_gaussian_blur(hm, kernel):
    border = (kernel - 1) // 2
    batch_size = hm.shape[0]
    num_joints = hm.shape[1]
    height = hm.shape[2]
    width = hm.shape[3]
    for i in range(batch_size):
        for j in range(num_joints):
            origin_max = np.max(hm[i,j])
            dr = np.zeros((height + 2 * border, width + 2 * border))
            dr[border: -border, border: -border] = hm[i,j].copy()
            dr = cv2.GaussianBlur(dr, (kernel, kernel), 0)
            hm[i,j] = dr[border: -border, border: -border].copy()
            hm[i,j] *= origin_max / np.max(hm[i,j])
    return hm


def dark_get_final_preds(config, hm, center, scale):
    coords, maxvals = dark_get_max_preds(hm)
    heatmap_height = hm.shape[2]
    heatmap_width = hm.shape[3]

    # post-processing
    hm = dark_gaussian_blur(hm, config.TEST.BLUR_KERNEL)
    hm = np.maximum(hm, 1e-10)
    hm = np.log(hm)
    for n in range(coords.shape[0]):
        for p in range(coords.shape[1]):
            coords[n,p] = dark_taylor(hm[n][p], coords[n][p])

    preds = coords.copy()

    # Transform back
    for i in range(coords.shape[0]):
        preds[i] = dark_transform_preds(
            coords[i], center[i], scale[i], [heatmap_width, heatmap_height]
        )

    return preds, maxvals

In [12]:
def res_get_max_preds(batch_heatmaps):
    '''
    get predictions from score maps
    heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
    '''
    assert isinstance(batch_heatmaps, np.ndarray), \
        'batch_heatmaps should be numpy.ndarray'
    assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim'

    batch_size = batch_heatmaps.shape[0]
    num_joints = batch_heatmaps.shape[1]
    width = batch_heatmaps.shape[3]
    heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))
    idx = np.argmax(heatmaps_reshaped, 2)
    maxvals = np.amax(heatmaps_reshaped, 2)

    maxvals = maxvals.reshape((batch_size, num_joints, 1))
    idx = idx.reshape((batch_size, num_joints, 1))

    preds = np.tile(idx, (1, 1, 2)).astype(np.float32)

    preds[:, :, 0] = (preds[:, :, 0]) % width
    preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)

    pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
    pred_mask = pred_mask.astype(np.float32)

    preds *= pred_mask
    return preds, maxvals


def res_get_final_preds(config, batch_heatmaps, center, scale):
    coords, maxvals = res_get_max_preds(batch_heatmaps)

    heatmap_height = batch_heatmaps.shape[2]
    heatmap_width = batch_heatmaps.shape[3]

    preds = coords.copy()

    # Transform back
    for i in range(coords.shape[0]):
        preds[i] = res_transform_preds(coords[i], center, scale,
                                   [heatmap_width, heatmap_height])

    return preds, maxvals

def res_transform_preds(coords, center, scale, output_size):
    target_coords = np.zeros(coords.shape)
    trans = res_get_affine_transform(center, scale, 0, output_size, inv=1)
    for p in range(coords.shape[0]):
        target_coords[p, 0:2] = res_affine_transform(coords[p, 0:2], trans)
    return target_coords


def res_get_affine_transform(center,
                         scale,
                         rot,
                         output_size,
                         shift=np.array([0, 0], dtype=np.float32),
                         inv=0):
    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
        print(scale)
        scale = np.array([scale, scale])

    scale_tmp = scale * 200.0
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = res_get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = res_get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = res_get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans


def res_affine_transform(pt, t):
    new_pt = np.array([pt[0], pt[1], 1.]).T
    new_pt = np.dot(t, new_pt)
    return new_pt[:2]


def res_get_3rd_point(a, b):
    direct = a - b
    return b + np.array([-direct[1], direct[0]], dtype=np.float32)


def res_get_dir(src_point, rot_rad):
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)

    src_result = [0, 0]
    src_result[0] = src_point[0] * cs - src_point[1] * sn
    src_result[1] = src_point[0] * sn + src_point[1] * cs

    return src_result

In [13]:
def affine_transform(pt, t):
    new_pt = np.array([pt[0], pt[1], 1.]).T
    new_pt = np.dot(t, new_pt)
    return new_pt[:2]

def get_dir(src_point, rot_rad):
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)

    src_result = [0, 0]
    src_result[0] = src_point[0] * cs - src_point[1] * sn
    src_result[1] = src_point[0] * sn + src_point[1] * cs

    return src_result

def get_3rd_point(a, b):
    direct = a - b
    return b + np.array([-direct[1], direct[0]], dtype=np.float32)

def get_affine_transform(
        center, scale, rot, output_size,
        shift=np.array([0, 0], dtype=np.float32), inv=0
):
    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
        print(scale)
        scale = np.array([scale, scale])

    scale_tmp = scale * 200.0
    src_w = scale_tmp[0]
    dst_w = output_size[0]
    dst_h = output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = get_dir([0, src_w * -0.5], rot_rad)
    dst_dir = np.array([0, dst_w * -0.5], np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    dst = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + scale_tmp * shift
    src[1, :] = center + src_dir + scale_tmp * shift
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir

    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])

    if inv:
        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
    else:
        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans

def transform_preds(coords, center, scale, output_size):
    target_coords = np.zeros(coords.shape)
    trans = get_affine_transform(center, scale, 0, output_size, inv=1)
    for p in range(coords.shape[0]):
        target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
    return target_coords

def get_max_preds(batch_heatmaps):
    '''
    get predictions from score maps
    heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
    '''
    assert isinstance(batch_heatmaps, np.ndarray), \
        'batch_heatmaps should be numpy.ndarray'
    assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim'

    batch_size = batch_heatmaps.shape[0]
    num_joints = batch_heatmaps.shape[1]
    width = batch_heatmaps.shape[3]
    heatmaps_reshaped = batch_heatmaps.reshape((batch_size, num_joints, -1))
    idx = np.argmax(heatmaps_reshaped, 2)
    maxvals = np.amax(heatmaps_reshaped, 2)

    maxvals = maxvals.reshape((batch_size, num_joints, 1))
    idx = idx.reshape((batch_size, num_joints, 1))

    preds = np.tile(idx, (1, 1, 2)).astype(np.float32)

    preds[:, :, 0] = (preds[:, :, 0]) % width
    preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)

    pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
    pred_mask = pred_mask.astype(np.float32)

    preds *= pred_mask
    return preds, maxvals


def get_final_preds(config, batch_heatmaps, center, scale):
    coords, maxvals = get_max_preds(batch_heatmaps)

    heatmap_height = batch_heatmaps.shape[2]
    heatmap_width = batch_heatmaps.shape[3]

    # post-processing
    if config.TEST.POST_PROCESS:
        for n in range(coords.shape[0]):
            for p in range(coords.shape[1]):
                hm = batch_heatmaps[n][p]
                px = int(math.floor(coords[n][p][0] + 0.5))
                py = int(math.floor(coords[n][p][1] + 0.5))
                if 1 < px < heatmap_width-1 and 1 < py < heatmap_height-1:
                    diff = np.array(
                        [
                            hm[py][px+1] - hm[py][px-1],
                            hm[py+1][px]-hm[py-1][px]
                        ]
                    )
                    coords[n][p] += np.sign(diff) * .25

    preds = coords.copy()

    # Transform back
    for i in range(coords.shape[0]):
        preds[i] = transform_preds(
            coords[i], center[i], scale[i], [heatmap_width, heatmap_height]
        )

    return preds, maxvals

def preprocess_image(img_path, c, s, cfg):
    flip_pairs = [[0, 5], [1, 4], [
            2, 3], [10, 15], [11, 14], [12, 13]]

    scale_factor = cfg.DATASET.SCALE_FACTOR
    rotation_factor = cfg.DATASET.ROT_FACTOR
    flip = cfg.DATASET.FLIP

    image_size = cfg.MODEL.IMAGE_SIZE
    target_type = cfg.MODEL.EXTRA.TARGET_TYPE
    heatmap_size = cfg.MODEL.EXTRA.HEATMAP_SIZE
    sigma = cfg.MODEL.EXTRA.SIGMA

    
    transform = transforms.Compose([
        transforms.ToTensor(),          
        transforms.Normalize(            
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

    data_numpy = img_path
    r = 0

    trans = res_get_affine_transform(c, s, r, image_size)
    input = cv2.warpAffine(
        data_numpy,
        trans,
        (int(image_size[0]), int(image_size[1])),
        flags=cv2.INTER_LINEAR)

    input = transform(input)

    return input


def predict_keypoints(origin_img, c, s, config, model, num_samples = 1):
    model.eval()

    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3),
                         dtype=np.float32)
                
    input = preprocess_image(origin_img, c, s, config)
    input = input.unsqueeze(0)
    
    with torch.no_grad():
        output = model(input)

        num_images = input.size(0)
  
        preds, maxvals = res_get_final_preds(
            config, output.clone().cpu().numpy(), c, s)
        
        idx = 0
        all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]       
        all_preds = all_preds[:, :, 0:2] + 1.0
        
        return all_preds[0]

In [14]:
# function definition
def cosine_similarity(a, b):
  cos = ( np.dot(a, b) / 
          (np.linalg.norm(a) * np.linalg.norm(b)) )
  if cos < -1:
    return -1
  elif cos > 1:
    return 1
  else:
    return cos


def angle_degree(a, b):
  angle = (np.arccos(cosine_similarity(a, b)) / 
          np.pi) * 180
  if angle > 180:
    return 180.
  if angle < 0:
    return 0
  else:
    return angle

# @parameter
# part: contain 2 joint coordinate [[x0, y0], [x1, y1]]
def return_vector(part_id, skeleton):
  point_a = skeleton[part_id[0]]
  point_b = skeleton[part_id[1]]
  return point_a - point_b

def is_in_range(num, range1, range2):
  ceil = max(range1, range2)
  floor = min(range1, range2)
  return (num >= floor and num <= ceil)

def find_pose_id(ske_pred, keyparts, stages):
  assert ske_pred.shape == (NUM_KPTS, 2)
  avg_angle = 0
  for keypart in keyparts:
    part_a, part_b = keypart
    vec_a = return_vector(SKELETON[part_a], ske_pred)
    vec_b = return_vector(SKELETON[part_b], ske_pred)

    if sum(vec_a == 0) == 2 or sum(vec_b == 0) == 2:
      return -1
    
    pred_angle = angle_degree(vec_a, vec_b)
    avg_angle += pred_angle

  avg_angle /= len(keyparts)
  for i, stage in enumerate(stages):
    if is_in_range(avg_angle, stage[0], stage[1]):
      return i

def compare_skeleton(ske_true, ske_pred, excer_part, threshold= 10):
  assert ske_true.shape == (NUM_KPTS, 2)
  assert ske_pred.shape == (NUM_KPTS, 2)
  part_flag = []
  for part in excer_part:
    kpt_a, kpt_b = SKELETON[part]
    true_vec = ske_true[kpt_a] - ske_true[kpt_b]
    pred_vec = ske_pred[kpt_a] - ske_pred[kpt_b]
    angle = angle_degree(true_vec, pred_vec)
    part_flag.append(True if angle <= threshold else False)
  return part_flag

def draw_pose(keypoints, part_flag, excer_part, img, joint_thickness=6):
  assert keypoints.shape == (NUM_KPTS, 2)
  for i, part in enumerate(excer_part):
    kpt_a, kpt_b = SKELETON[part]
    c = COLOR["blue"] if part_flag[i] else COLOR["red"]
    x_a, y_a = keypoints[kpt_a]
    x_b, y_b = keypoints[kpt_b]
    cv2.circle(img, (int(x_a), int(y_a)), joint_thickness, COLOR["green"], -1)
    cv2.circle(img, (int(x_b), int(y_b)), joint_thickness, COLOR["green"], -1)
    cv2.line(img, (int(x_a), int(y_a)), (int(x_b), int(y_b)), c, 2)
    
def dark_get_pose_estimation_prediction(pose_model, cfg, image, center, scale):
  scale = np.array([scale, scale])
  rotation = 0

  # pose estimation transformation
  trans = dark_get_affine_transform(center, scale, rotation, cfg.MODEL.IMAGE_SIZE)
  model_input = cv2.warpAffine(
    image,
    trans,
    (int(cfg.MODEL.IMAGE_SIZE[0]), int(cfg.MODEL.IMAGE_SIZE[1])),
    flags=cv2.INTER_LINEAR)
  transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225]),
  ])

  # pose estimation inference
  model_input = transform(model_input).unsqueeze(0)
  # switch to evaluate mode
  pose_model.eval()
  with torch.no_grad():
    # compute output heatmap
    output = pose_model(model_input)
    preds, _ = dark_get_final_preds(
      cfg,
      output.clone().cpu().numpy(),
      np.asarray([center]),
      np.asarray([scale]))

    return preds

def hr_get_pose_estimation_prediction(pose_model, cfg, image, center, scale):
  scale = np.array([scale, scale])
  rotation = 0

  # pose estimation transformation
  trans = get_affine_transform(center, scale, rotation, cfg.MODEL.IMAGE_SIZE)
  model_input = cv2.warpAffine(
    image,
    trans,
    (int(cfg.MODEL.IMAGE_SIZE[0]), int(cfg.MODEL.IMAGE_SIZE[1])),
    flags=cv2.INTER_LINEAR)
  transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225]),
  ])

  # pose estimation inference
  model_input = transform(model_input).unsqueeze(0)
  # switch to evaluate mode
  pose_model.eval()
  with torch.no_grad():
    # compute output heatmap
    output = pose_model(model_input)
    preds, _ = get_final_preds(
      cfg,
      output.clone().cpu().numpy(),
      np.asarray([center]),
      np.asarray([scale]))

    return preds

def calculate_center_scale(box):
  x1, y1, x2, y2 = box
  center_x = (x2 + x1) / 2
  center_y = (y2 + y1) / 2
  width = x2 - x1
  height = y2 - y1
  
  center = np.array([center_x, center_y], dtype=np.float32)
  scale = max(width, height) / 200

  return center, scale

def get_person_box(model, img):
  results = model(img)
  
  data = results.xyxy[0].cpu().numpy()
    
  if len(data) == 0:
    return [-1], [-1]

  max_index = np.argmax(data[:, 4])
  max_element = data[max_index]
  bbox = max_element[:4]

  center, scale = calculate_center_scale(bbox)

  return center, scale
    
def load_yolo(version):
  yolo = torch.hub.load('ultralytics/yolov5', version, 
                        pretrained= True, _verbose= False)
  yolo.cuda()
  yolo.classes = [0]
  return yolo

def load_res(cfg):
  model = get_res_pose_net(cfg, is_train=True)
  gpus = [int(i) for i in poseres_cfg.GPUS.split(',')]
  model = torch.nn.DataParallel(model, device_ids=gpus).cuda()
  return model

def load_hrnet(cfg):
  pose_model = get_pose_net(cfg, is_train=False)
    
  print(cfg.TEST.MODEL_FILE)
  if cfg.TEST.MODEL_FILE:
    print('=> loading model from {}'.format(cfg.TEST.MODEL_FILE))
    pose_model.load_state_dict(torch.load(cfg.TEST.MODEL_FILE), strict=False)
  else:
    print('expected model defined in config at TEST.MODEL_FILE')

  pose_model = torch.nn.DataParallel(pose_model, device_ids=cfg.GPUS)
  pose_model.to(CTX)
  pose_model.eval()

  return pose_model

def load_dark(cfg):
  pose_model = get_dark_pose_net(cfg, is_train=False)
    
  print(cfg.TEST.MODEL_FILE)
  if cfg.TEST.MODEL_FILE:
    print('=> loading model from {}'.format(cfg.TEST.MODEL_FILE))
    pose_model.load_state_dict(torch.load(cfg.TEST.MODEL_FILE), strict=False)
  else:
    print('expected model defined in config at TEST.MODEL_FILE')

  pose_model = torch.nn.DataParallel(pose_model, device_ids=cfg.GPUS)
  pose_model.to(CTX)
  pose_model.eval()

  return pose_model

def load_true_poses(path):
  joints_path = os.path.join(path, "joints.csv")
  
  img_path = os.path.join(path, "images")
  
  img_paths = os.listdir(img_path)
  img_paths.sort(key= lambda x: int(x[0]))
  imgs = [cv2.imread(os.path.join(img_path, p)) for p in img_paths]
  skes = pd.read_csv(joints_path).to_numpy().reshape(len(imgs), NUM_KPTS, 2)
  return imgs, skes

In [30]:
def res_video_test(video_path, output_path, box_model, pose_model, true_pose_images, true_pose_skes,
                   cfg, keyparts, visual_parts, stage_angle, angle_threshold=5):

    # Open video file
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Cannot open this video.")
        return

    # Get video properties
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Create video writer
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, 
                          (frame_width * 2, frame_height))
    count = 0
    pose_id = 0

    # Read and process frames
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        center, scale = get_person_box(box_model, frame_rgb)
        if center[0] != -1:
            pose_preds = predict_keypoints(frame_rgb, np.array(center), 
                                           np.array([scale, scale]), cfg, pose_model)

            pose_id = find_pose_id(pose_preds, keyparts, stage_angle)

            true_pose_image = true_pose_images[pose_id]
            true_skeleton = true_pose_skes[pose_id]
            part_flag = compare_skeleton(true_skeleton, pose_preds, visual_parts, threshold=5)
            draw_pose(pose_preds, part_flag, visual_parts, frame)

            # Resize true pose image to match the frame size
            true_pose_resized = cv2.resize(true_pose_image, (frame_width, frame_height))

            # Combine frames side by side
            combined_frame = np.hstack((frame, true_pose_resized))
        else:
            true_pose_image = true_pose_images[0]
            true_pose_resized = cv2.resize(true_pose_image, (frame_width, frame_height))
            combined_frame = np.hstack((frame, true_pose_resized))

        # Write combined frame to output video
        out.write(combined_frame)


    cap.release()
    out.release()
    print("Video processing complete. Output saved to", output_path)

In [39]:
def hr_video_test(video_path, output_path, box_model, pose_model, true_pose_images, true_pose_skes,
                   cfg, keyparts, visual_parts, stage_angle, angle_threshold=5):
  
  video_cap = cv2.VideoCapture(video_path)
  if not video_cap.isOpened():
    print("Cannot open this video.")
    return
  
  frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
  frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
  fps = 24.0
  video_out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), 
                        fps, (frame_width * 2, frame_height))
  canvas_size = (frame_height, frame_width, 3)
  canvas = np.zeros(canvas_size)
  count = 0
  while True:
    pose_id = 0
      
    ret, image_bgr = video_cap.read()
    if not ret:
      break
    
    image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    center, scale = get_person_box(box_model, image)

    if center[0] == -1:
     continue

    image_pose = image.copy() if cfg.DATASET.COLOR_RGB else image_bgr.copy()
    pose_pred = hr_get_pose_estimation_prediction(pose_model, cfg, image_pose, center, scale)[0]
    pose_id = find_pose_id(pose_pred, keyparts, stage_angle)
    if pose_id == -1:
      continue
    
    true_pose_image = true_pose_images[pose_id]
    true_skeleton = true_pose_skes[pose_id]
    part_flag = compare_skeleton(true_skeleton, pose_pred, visual_parts, threshold= angle_threshold)
    draw_pose(pose_pred, part_flag, visual_parts, image_bgr)
    
    true_pose_resized = cv2.resize(true_pose_image, (frame_width, frame_height))
    combined_frame = np.hstack((image_bgr, true_pose_resized)).astype(np.uint8)
    video_out.write(combined_frame)
    
    count += 1
    if count % 24 == 0:
      print(count)
  
  video_cap.release()
  video_out.release()
  print("Video processing complete. Output saved to", output_path)

In [44]:
def dark_video_test(video_path, output_path, box_model, pose_model, true_pose_images, true_pose_skes,
                   cfg, keyparts, visual_parts, stage_angle, angle_threshold=5):
  
  video_cap = cv2.VideoCapture(video_path)
  if not video_cap.isOpened():
    print("Cannot open this video.")
    return
  
  frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
  frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
  fps = 24.0
  video_out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), 
                        fps, (frame_width * 2, frame_height))
  canvas_size = (frame_height, frame_width, 3)
  canvas = np.zeros(canvas_size)
  count = 0
  while True:
    pose_id = 0
      
    ret, image_bgr = video_cap.read()
    if not ret:
      break
    
    image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    center, scale = get_person_box(box_model, image)

    if center[0] == -1:
     continue

    image_pose = image.copy() if cfg.DATASET.COLOR_RGB else image_bgr.copy()
    pose_pred = dark_get_pose_estimation_prediction(pose_model, cfg, image_pose, center, scale)[0]
    pose_id = find_pose_id(pose_pred, keyparts, stage_angle)
    if pose_id == -1:
      continue
    
    true_pose_image = true_pose_images[pose_id]
    true_skeleton = true_pose_skes[pose_id]
    part_flag = compare_skeleton(true_skeleton, pose_pred, visual_parts, threshold= angle_threshold)
    draw_pose(pose_pred, part_flag, visual_parts, image_bgr)

    true_pose_resized = cv2.resize(true_pose_image, (frame_width, frame_height))
    combined_frame = np.hstack((image_bgr, true_pose_resized)).astype(np.uint8)
    video_out.write(combined_frame)
    
    count += 1
    if count % 24 == 0:
      print(count)
  
  video_cap.release()
  video_out.release()
  print("Video processing complete. Output saved to", output_path)

# Demo

In [24]:
squat_dir = "/kaggle/input/cva-inputs/squat"
jumping_jack_dir = "/kaggle/input/cva-inputs/jumping_jack"
box_model = load_yolo("yolov5s")
squat_images, squat_skes = load_true_poses(squat_dir)
jumping_jack_images, jumping_jack_skes = load_true_poses(jumping_jack_dir)
respose_model = load_res(poseres_cfg)
hrpose_model = load_hrnet(HRnet_cfg)
darkpose_model = load_dark(DarkPose_cfg)

Using cache found in /root/.cache/torch/hub/ultralytics_yolov5_master


Load pretrained model successfully!
/kaggle/input/cva-models/hrpose_w32_256x256.pth
=> loading model from /kaggle/input/cva-models/hrpose_w32_256x256.pth
/kaggle/input/cva-models/dark_w32_256×256.pth
=> loading model from /kaggle/input/cva-models/dark_w32_256×256.pth


In [27]:
squat_input_video = "/kaggle/input/cva-inputs/stop_doing.mp4"
jumping_jack_input_video = "/kaggle/input/cva-inputs/tue_jumping_jack_1.mp4"

## Pose Resnet

In [35]:
res_output_video = "res_output_video.mp4"
res_video_test(squat_input_video, res_output_video, box_model, respose_model, squat_images,
               squat_skes, poseres_cfg, SQUAT_KEYPART, SQUAT_PART, SQUAT_STAGE_ANGLE)

  cos = ( np.dot(a, b) /
  cos = ( np.dot(a, b) /


Video processing complete. Output saved to res_output_video.mp4


In [41]:
res_video_test(jumping_jack_input_video, res_output_video, box_model, respose_model, 
               jumping_jack_images, jumping_jack_skes, poseres_cfg, JUMPING_JACK_KEYPART, 
               JUMPING_JACK_PART, JUMPING_JACK_STAGE_ANGLE)

Video processing complete. Output saved to res_output_video.mp4


## HRnet

In [40]:
hr_output_video = "hr_output_video.mp4"
hr_video_test(squat_input_video, hr_output_video, box_model, hrpose_model, squat_images,
               squat_skes, HRnet_cfg, SQUAT_KEYPART, SQUAT_PART, SQUAT_STAGE_ANGLE)

24
48
72
96
120
144
168
192
216
240
Video processing complete. Output saved to hr_output_video.mp4


In [42]:
hr_video_test(jumping_jack_input_video, hr_output_video, box_model, hrpose_model, 
              jumping_jack_images, jumping_jack_skes, HRnet_cfg, JUMPING_JACK_KEYPART, 
               JUMPING_JACK_PART, JUMPING_JACK_STAGE_ANGLE)

24
48
72
96
120
144
168
192
216
Video processing complete. Output saved to hr_output_video.mp4


## DarkPose

In [45]:
dark_output_video = "dark_output_video.mp4"
dark_video_test(squat_input_video, dark_output_video, box_model, hrpose_model, squat_images, 
                squat_skes, DarkPose_cfg, SQUAT_KEYPART, SQUAT_PART, SQUAT_STAGE_ANGLE)

24
48
72
96
120
144
168
192
216
240
Video processing complete. Output saved to dark_output_video.mp4


In [46]:
dark_video_test(jumping_jack_input_video, dark_output_video, box_model, hrpose_model, 
                jumping_jack_images, jumping_jack_skes, DarkPose_cfg, JUMPING_JACK_KEYPART, 
                JUMPING_JACK_PART, JUMPING_JACK_STAGE_ANGLE)

24
48
72
96
120
144
168
192
216
Video processing complete. Output saved to dark_output_video.mp4
