In [1]:
import os
import numpy as np
import torch
from common.model import TemporalModel
from common.generators import UnchunkedGenerator
from common.camera import normalize_screen_coordinates, image_coordinates, camera_to_world
from common.visualization import render_animation
from common.skeleton import Skeleton
from common.utils import deterministic_random

In [45]:
video_path = 'inputdir/Clip14Miss.mp4'
keypoints_path = 'data/custom/Clip14Miss.npz'
output_keypoints_3d_path = 'outputdir/Clip14Miss3D.npz'
checkpoint_path = 'checkpoint/pretrained_h36m_detectron_coco.bin'

In [46]:
metadata = {
    'layout_name': 'coco',
    'num_joints': 17,
    'keypoints_symmetry': ([1,3,5,7,9,11,13,15], [2,4,6,8,10,12,14,16])
}

In [47]:
keypoints_data = np.load(keypoints_path, allow_pickle=True)
keypoints = keypoints_data['keypoints'] # (1, T, 17, 2)
print(keypoints.shape)
keypoints = keypoints.astype('float32')


(68, 17, 2)


In [48]:
W, H = 1280, 720  # Update if your video resolution is different
keypoints[0] = normalize_screen_coordinates(keypoints[0], w=W, h=H)

In [49]:
model = TemporalModel(
    num_joints_in=17, in_features=2, num_joints_out=17,
    filter_widths=[3, 3, 3, 3, 3],
    causal=False, dropout=0.25, channels=1024
)

In [50]:
checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_pos'])
model.eval()

TemporalModel(
  (drop): Dropout(p=0.25, inplace=False)
  (relu): ReLU(inplace=True)
  (expand_bn): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (shrink): Conv1d(1024, 51, kernel_size=(1,), stride=(1,))
  (expand_conv): Conv1d(34, 1024, kernel_size=(3,), stride=(1,), bias=False)
  (layers_conv): ModuleList(
    (0): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), dilation=(3,), bias=False)
    (1): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,), bias=False)
    (2): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), dilation=(9,), bias=False)
    (3): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,), bias=False)
    (4): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), dilation=(27,), bias=False)
    (5): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,), bias=False)
    (6): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), dilation=(81,), bias=False)
    (7): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,), bias=False)
  )
  (layers_bn): Mod

In [51]:
pad = (model.receptive_field() - 1) // 2
inputs_2d = [np.pad(keypoints, ((pad, pad), (0, 0), (0, 0)), mode='edge')]

gen = UnchunkedGenerator(None, None, inputs_2d, pad=pad)

for _,_,batch_2d in gen.next_epoch():
    batch_2d=torch.from_numpy(batch_2d).float()
    predicted_3d = model(batch_2d)[0].detach().numpy()

predicted_3d -= predicted_3d[:, :1, :]  # Root-relative
predicted_3d.shape

(310, 17, 3)

In [52]:
np.savez_compressed(output_keypoints_3d_path, keypoints_3d=predicted_3d)
print(f"[✅] Saved 3D keypoints to {output_keypoints_3d_path}")


[✅] Saved 3D keypoints to outputdir/Clip14Miss3D.npz
