In [1]:
import os
import torch

from models.model import ViTPose
from configs.ViTPose_base_coco_256x192 import model as model_cfg

# Torch to ONNX

In [4]:
CKPT_PATH = "/home/cgusti/ViTPose_pytorch/checkpoints/vitpose-b-multi-coco.pth"
C, H, W = (3, 256, 192)

model = ViTPose(model_cfg)
ckpt = torch.load(CKPT_PATH)
model.load_state_dict(ckpt['state_dict'])
model.eval()

output_onnx = 'vitpose_dynamic.onnx'
input_names = ["input_0"]
output_names = ["output_0"]

device = next(model.parameters()).device
inputs = torch.randn(1, C, H, W).to(device)

dynamic_axes = {'input_0' : {0 : 'batch_size'},
                'output_0' : {0 : 'batch_size'}}

torch_out = torch.onnx.export(model, inputs, output_onnx, export_params=True, verbose=False,
                              input_names=input_names, output_names=output_names, 
                              opset_version=11, dynamic_axes = dynamic_axes)
print(f">>> Saved at: {os.path.abspath(output_onnx)}")

verbose: False, log level: Level.ERROR

>>> Saved at: /home/cgusti/ViTPose_pytorch/vitpose_dynamic.onnx


# Inference with ONNX

In [12]:
IMG_PATH = "/home/cgusti/ViTPose_pytorch/examples/yoga_pose_3.jpeg"

import onnx
import onnxruntime

import cv2
import numpy as np
import matplotlib.pyplot as plt

from time import time
from PIL import Image
from torchvision.transforms import transforms

from utils.visualization import draw_points_and_skeleton, joints_dict
from utils.dist_util import get_dist_info, init_dist
from utils.top_down_eval import keypoints_from_heatmaps

def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

ort_session = onnxruntime.InferenceSession(output_onnx)

# Prepare input data
img = Image.open(IMG_PATH)

org_w, org_h = img.size
print(f">>> Original image size: {org_h} X {org_w} (height X width)")
print(f">>> Resized image size: {H} X {W} (height X width)")
print(f">>> Scale change: {org_h/H}, {org_w/W}")
img_tensor = transforms.Compose (
    [transforms.Resize((H, W)),
        transforms.ToTensor()]
)(img).unsqueeze(0).to(device)

# Feed to model
tic = time()
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(img_tensor)}
heatmaps = ort_session.run(None, ort_inputs)[0]
# heatmaps = vit_pose(img_tensor).detach().cpu().numpy() # N, 17, h/4, w/4
elapsed_time = time()-tic
print(f">>> Output size: {heatmaps.shape} ---> {elapsed_time:.4f} sec. elapsed [{elapsed_time**-1: .1f} fps]\n")    

print('this line is working')
# points = heatmap2coords(heatmaps=heatmaps, original_resolution=(org_h, org_w))
points, prob = keypoints_from_heatmaps(heatmaps=heatmaps, center=np.array([[org_w//2, org_h//2]]), scale=np.array([[org_w, org_h]]),
                                        unbiased=True, use_udp=True)
points = np.concatenate([points[:, :, ::-1], prob], axis=2)
print('this line is working2')
# Visualization 
for pid, point in enumerate(points):
    print('this line is working3')
    img = draw_points_and_skeleton(img.copy(), point, joints_dict()['coco']['skeleton'], person_index=pid,
                                    points_color_palette='gist_rainbow', skeleton_color_palette='jet',
                                    points_palette_samples=10, confidence_threshold=0.4)
    plt.figure(figsize=(5,10))
    print('showing image')
    plt.imshow(img)
    print('showing image is successful')
    plt.title("Result")
    plt.axis('off')
    plt.show()

>>> Original image size: 1000 X 1500 (height X width)
>>> Resized image size: 256 X 192 (height X width)
>>> Scale change: 3.90625, 7.8125
>>> Output size: (1, 17, 64, 48) ---> 0.0642 sec. elapsed [ 15.6 fps]

this line is working
this line is working2
this line is working3


error: OpenCV(4.7.0) :-1: error: (-5:Bad argument) in function 'line'
> Overload resolution failed:
>  - img is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'img'
