# Pose Detection

In [1]:
import os

import numpy as np
import openvino as ov
from typing import Any
import json
import time

import cv2

from models.rtmpose.deploy_infer import (
    Compose,
    GetBBoxCenterScale,
    LoadImage,
    PackPoseInputs,
    PoseDataPreprocessor,
    TopdownAffine,
    prepare_data,
    restore_keypoints,
)

`requirements_demo.txt` lists the packages (and their versions) that are required to run this code.

### Setup model

In [2]:
# Names of the exported models that this notebook knows how to load.
DEPLOYED_MODELS = [
    "rtm_body8_26keypoints_pose-m_256x192",
    "rtm_coco_pose-l",
    "rtm_body8_26keypoints_pose-m_384x288",
    "rtm_coco_pose-m",
    "rtm_body8_pose-s",
    "rtm_body8_pose-m",
]

# Model used by every cell below; it has to be one of DEPLOYED_MODELS.
MODEL_NM = "rtm_body8_26keypoints_pose-m_384x288"

assert MODEL_NM in DEPLOYED_MODELS, f"Model {MODEL_NM} not found in DEPLOYED_MODELS"

# INPUT_SIZE is the (width, height) fed to the network; OUTPUT_SIZE holds the
# lengths of the two model outputs (x first, then y). Only the "384x288"
# variants use the larger resolution.
INPUT_SIZE, OUTPUT_SIZE = (
    ((288, 384), (576, 768)) if "384x288" in MODEL_NM else ((192, 256), (384, 512))
)

# The "26keypoints" variants predict 26 keypoints, every other model 17.
N_KEYPOINTS = 26 if "26keypoints" in MODEL_NM else 17

In [3]:
# OpenVINO runtime object; the compile_model calls below go through it.
core = ov.Core()
# Location of the selected model's .xml file: ./out/<model name>/<model name>.xml
model = f"./out/{MODEL_NM}/{MODEL_NM}.xml"

In [4]:
%%timeit
# Benchmark only: measures how long compiling the model for the CPU device takes.
# Names assigned inside a %%timeit cell are not kept in the notebook namespace,
# which is why the model is compiled again in the next cell.
compiled_model = core.compile_model(
    model=model,
    device_name="CPU",  # , config={"DYN_BATCH_ENABLED": "YES"}
)

91.4 ms ± 601 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Compile the model for the CPU device; this is the instance that `inference`
# uses to create its infer requests.
compiled_model = core.compile_model(
    model=model,
    device_name="CPU",  # , config={"DYN_BATCH_ENABLED": "YES"}
)

In [6]:
# Display the compiled model's inputs and outputs (names, shapes, dtypes).
compiled_model

<CompiledModel:
inputs[
<ConstOutput: names[image] shape[?,3,384,288] type: f32>
]
outputs[
<ConstOutput: names[707] shape[?,26,576] type: f32>,
<ConstOutput: names[709] shape[?,26,768] type: f32>
]>

The model has a dynamic batch size, which allows running it once per image with all people found in that image passed together as a single batch.

### Preprocess images

Preprocessing in the original repo depended on multiple packages (mmpose, mmcv, mmengine), which introduces many restrictions (e.g. the appropriate version of mmcv will not work with python>=3.10 or torch>=2.0).

To overcome this, all necessary transformations have been extracted to `models.rtmpose.deploy_mmpose_replacement.py`, leaving dependencies only on cv2, numpy and torch. As long as these packages do not introduce breaking changes the code should keep working; it works with the latest versions (e.g. we can use Python 3.11 and torch 2.3.1).

Depending on the use case you might want to change the preprocessing steps (most likely `LoadImage` and `PoseDataPreprocessor`, depending on what the input to the model is).

The pipeline works on the whole image together with its bboxes (the preprocessing steps have parameters that control things like padding, etc.).

In [7]:
# Preprocessing pipeline handed to `prepare_data` in `inference`. The step
# classes come from models.rtmpose.deploy_infer; presumably Compose applies
# them in the listed order — confirm against that module.
pipeline = Compose(
    [
        LoadImage(),
        GetBBoxCenterScale(),
        # INPUT_SIZE is chosen in the model-setup cell based on the model name.
        TopdownAffine(input_size=INPUT_SIZE),
        PackPoseInputs(),
    ]
)

In [8]:
# Normalisation settings handed to `prepare_data` in `inference`. The mean/std
# values look like the usual ImageNet statistics on a 0-255 scale — TODO
# confirm they match what the model was trained with.
data_preprocessor = PoseDataPreprocessor(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    # The images in this notebook are decoded with OpenCV. If the image array
    # comes from another source, check whether this channel swap is still required.
    bgr_to_rgb=True,
)

### Inference

In [9]:
def inference(img: str | np.ndarray, bboxes: np.ndarray = None):
    if bboxes is None:
        h, w, _ = img.shape
        bboxes = np.array([[0, 0, w, h]], dtype=np.float32)

    data_list, preprocessed_image = prepare_data(
        img, bboxes, pipeline, data_preprocessor
    )

    infer_request = compiled_model.create_infer_request()

    input_tensor = ov.Tensor(array=preprocessed_image.numpy(), shared_memory=True)
    infer_request.set_input_tensor(input_tensor)

    infer_request.set_output_tensor(
        0,
        ov.Tensor(
            np.zeros((bboxes.shape[0], N_KEYPOINTS, OUTPUT_SIZE[0]), dtype=np.float32)
        ),
    )
    infer_request.set_output_tensor(
        1,
        ov.Tensor(
            np.zeros((bboxes.shape[0], N_KEYPOINTS, OUTPUT_SIZE[1]), dtype=np.float32)
        ),
    )

    infer_request.start_async()
    infer_request.wait()

    simcc_x = infer_request.get_output_tensor(0).data
    simcc_y = infer_request.get_output_tensor(1).data

    openvino_pred, openvino_scores = restore_keypoints(simcc_x, simcc_y, data_list)

    return {
        "keypoints": openvino_pred,
        "keypoint_scores": openvino_scores,
    }

In [10]:
def load_json(file_path: str) -> Any:
    """Parse a JSON file and return the decoded Python object.

    The file is always read as UTF-8 so the result does not depend on the
    platform's default locale encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content (dict, list, ...).
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)


def load_img(filename: str) -> np.ndarray:
    """Read an image file into memory and decode it with OpenCV.

    Args:
        filename: Path of the image file.

    Returns:
        The decoded image as returned by ``cv2.imdecode`` with
        ``cv2.IMREAD_COLOR``.

    Raises:
        ValueError: If the file is empty or OpenCV cannot decode its content
            (``cv2.imdecode`` signals that by returning ``None``).
    """
    with open(filename, "rb") as f:
        raw_bytes = f.read()
    if not raw_bytes:
        raise ValueError(f"Image file is empty: {filename!r}")

    encoded = np.frombuffer(raw_bytes, np.uint8)
    img = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    if img is None:
        # Fail here with a clear message instead of handing None to callers.
        raise ValueError(f"Could not decode image file: {filename!r}")
    return img

In [11]:
# Show the raw content of the bounding-box file for the example image.
load_json("wycinki/cam1_1/objects_pos.json")

{'object_1': {'ymin': 436, 'ymax': 1068, 'xmin': 1004, 'xmax': 1225},
 'object_2': {'ymin': 443, 'ymax': 1080, 'xmin': 536, 'xmax': 842},
 'object_3': {'ymin': 260, 'ymax': 542, 'xmin': 302, 'xmax': 390},
 'object_4': {'ymin': 142, 'ymax': 362, 'xmin': 905, 'xmax': 970}}

In [12]:
def bboxes_to_xywh(bboxes: dict[str, dict[str, int]]) -> np.ndarray:
    """Stack per-object bounding-box dicts into one ``(n, 4)`` array.

    NOTE: despite the function name, every row holds the two box corners
    ``[xmin, ymin, xmax, ymax]`` — not ``[x, y, width, height]``. The name is
    kept so existing callers keep working.

    Args:
        bboxes: Mapping from an object id to a dict with the keys ``"xmin"``,
            ``"ymin"``, ``"xmax"`` and ``"ymax"``.

    Returns:
        Array with one row per object, in the mapping's iteration order. An
        empty mapping yields an array of shape ``(0, 4)``.

    Raises:
        KeyError: If one of the four coordinate keys is missing from a box.
    """
    rows = [
        [bbox["xmin"], bbox["ymin"], bbox["xmax"], bbox["ymax"]]
        for bbox in bboxes.values()
    ]
    # The reshape is a no-op for non-empty input; for an empty mapping it turns
    # the (0,) array into (0, 4) so `.shape[0]` / column indexing still work.
    return np.array(rows).reshape(-1, 4)

In [13]:
# Directory holding the example image ("image.jpg") and its "objects_pos.json".
img_root = "./wycinki/cam1_1"

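Bounding boxes need to be passed as an array of corner coordinates `[[xmin, ymin, xmax, ymax], ...]` (this is what `bboxes_to_xywh` returns, despite its name). Below is an example of how to prepare them from the `objects_pos.json` file that was delivered to us.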

In [14]:
# One row per object from objects_pos.json: [xmin, ymin, xmax, ymax].
bboxes = bboxes_to_xywh(load_json(os.path.join(img_root, "objects_pos.json")))
bboxes

array([[1004,  436, 1225, 1068],
       [ 536,  443,  842, 1080],
       [ 302,  260,  390,  542],
       [ 905,  142,  970,  362]])

In [15]:
# Decode the example image into a numpy array (via cv2.imdecode in load_img).
img = load_img(os.path.join(img_root, "image.jpg"))

The specified pipeline works on images given as numpy arrays of shape (h, w, c) in BGR channel order, as decoded by OpenCV (hence `bgr_to_rgb=True` in `PoseDataPreprocessor`).

In [16]:
# (height, width, channels) of the decoded image.
img.shape

(1080, 1920, 3)

The inference function returns dict with keypoints and keypoint_scores.

- keypoints: contains numpy array with shape (n, k, 2)
- keypoint_scores: contains numpy array with shape (n, k)

where

* n - number of bboxes,
* k - number of keypoints (depending on model),
* 2 - (x, y) coordinates 

The default threshold for a keypoint to be treated as recognized is 0.3. It is used for visualization; raise it if you want to display only the keypoints the model is more confident about.

In [17]:
%%timeit
# Benchmark one full `inference` call for the image and all of its boxes;
# the result is discarded.
_ = inference(img, bboxes)

63.4 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# Run inference once more, this time keeping the result for the cells below.
results = inference(img, bboxes)

In [19]:
# The result dict has exactly the two entries described above.
results.keys()

dict_keys(['keypoints', 'keypoint_scores'])

In [20]:
# (number of bboxes, number of keypoints, 2 coordinates)
results["keypoints"].shape

(4, 26, 2)

In [21]:
# (number of bboxes, number of keypoints)
results["keypoint_scores"].shape

(4, 26)

### Visualization

File `models/rtmpose/utils.py` contains `to_key_points` function that transforms keypoints and defines config (skeleton) that is later used by `draw_pose.get_pose` function to draw visualization of each person pose.

In [22]:
import models.rtmpose.utils as rtm_utils
import draw_pose

In [23]:
# A very low threshold (0.001) is passed here, so presumably almost every
# keypoint is kept for drawing — confirm against rtm_utils.to_key_points.
poses, config = rtm_utils.to_key_points(results, threshold=0.001)

In [24]:
def load_image(file_path: str) -> np.ndarray:
    """Load an image from disk with ``cv2.imread``.

    Args:
        file_path: Path of the image file.

    Returns:
        The image array returned by ``cv2.imread``.

    Raises:
        FileNotFoundError: If ``file_path`` does not point to an existing file.
        ValueError: If the file exists but OpenCV cannot read it as an image.
    """
    image = cv2.imread(file_path)
    if image is None:
        # cv2.imread reports every failure by returning None instead of
        # raising; turn that into an explicit error for the caller.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"Image file not found: {file_path!r}")
        raise ValueError(f"Could not read image file: {file_path!r}")
    return image

In [25]:
# Load the example image again (via cv2.imread) to serve as the canvas on
# which the poses are drawn below.
input_img = load_image(os.path.join(img_root, "image.jpg"))

In [None]:
import matplotlib.pyplot as plt

# Draw on a copy so that re-running this cell always starts from the clean
# image instead of stacking drawings (and a second BGR<->RGB swap) on top of
# the previous run.
annotated_img = input_img.copy()

# `poses` and `config` come from the `to_key_points` cell above. They used to
# be recomputed and rebound inside this loop while it was iterating `poses`,
# which did not affect the poses drawn on a fresh run but changed them on a
# re-run of the cell.
for pose in poses:
    annotated_img = draw_pose.get_pose(
        annotated_img, key_points=pose, edges=config, line_width=3
    )

# The image was loaded with OpenCV (BGR); matplotlib expects RGB.
annotated_rgb = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(annotated_rgb)
ax.axis("off")
plt.show()

In [27]:
# output of cells above is not present due to uncertainty of whether the image can be publicly shared