Use the `motionclip_py310_v2` environment to run this notebook.

In [1]:
# One-time notebook setup. The `_magic_done` flag is stored in the notebook
# globals so that re-running this cell skips the block below; in particular the
# relative `%cd ./MotionCLIP` is then not issued a second time.
if "_magic_done" not in globals(): # prevent multiple run
    # Load IPython's autoreload extension and switch it to mode 2.
    %load_ext autoreload
    %autoreload 2
    # The imports and relative paths in the following cells ("src.*", "./data", "./exps")
    # are resolved from inside the MotionCLIP checkout.
    %cd ./MotionCLIP
    _magic_done = True

/home/asad/workspace/DomainProject/changeDomain/notebooks/pose/4.motion_clip/MotionCLIP


In [2]:
import sys
sys.path.append('.')

import clip
import joblib
from IPython.display import Video

# MotionClip
from src.parser.visualize import parser
from src.datasets.get_dataset import get_datasets
import src.utils.rotation_conversions as geometry

In [3]:
# parser() is called without arguments below, so its command line is provided
# through sys.argv (NOTE(review): assumed to be read via argparse — confirm).
sys.argv = ["notebook"]  # dummy script name
sys.argv += ["./exps/paper-model/checkpoint_0100.pth.tar"]  # positional: checkpoint
sys.argv += ["--input_file", "./assets/paper_edits.csv"]

parameters, folder, checkpointname, epoch = parser()

In [4]:
# Load the CLIP backbone together with its image preprocessing transform.
clip_model, clip_preprocess = clip.load(
    "ViT-B/32",
    device=parameters['device'],
    jit=False,  # Must set jit=False for training
)
# Actually this line is unnecessary since clip by default already on float16
clip.model.convert_weights(clip_model)

# An empty (or missing) 'clip_training' entry means CLIP is used frozen:
# switch to eval mode and turn off gradients for every parameter.
if parameters.get('clip_training', '') == '':
    clip_model.eval()
    clip_model.requires_grad_(False)

# split = 'all' is the alternative, but it needs more memory
split = 'test'
datasets = get_datasets(parameters, clip_preprocess, split)

datapath used by amass is [./data/amass_db/amass_30fps_test.pt]


In [5]:
# Inspect the raw AMASS file directly.
# NOTE: joblib.load unpickles the file — only use it on trusted data.
data = joblib.load("./data/amass_db/amass_30fps_test.pt")
print(data.keys())
print("number of samples:", len(data["thetas"]))
# Shapes of the first three motions, one per line.
print("example shapes:", *(data["thetas"][i].shape for i in range(3)), sep="\n\t")

dict_keys(['vid_names', 'thetas', 'joints3d', 'clip_images', 'clip_pathes', 'text_raw_labels', 'text_proc_labels', 'action_cat'])
number of samples: 95
example shapes:
	(183, 72)
	(304, 72)
	(171, 72)


In [6]:
# Inspect what the dataset wrapper produces from the same file.
print(datasets) # train and test dataset are equal
print(datasets["test"][0].keys())
# "inp" shapes of the first three samples, one per line.
print("example shapes:", *(datasets["train"][i]["inp"].shape for i in range(3)), sep="\n\t")

{'train': amass dataset: (157, _, ..), 'test': amass dataset: (157, _, ..)}
dict_keys(['inp', 'target', 'clip_image', 'clip_path', 'clip_text'])
example shapes:
	torch.Size([25, 6, 60])
	torch.Size([25, 6, 60])
	torch.Size([25, 6, 60])


# Visualize a motion loaded directly from the file

In [7]:
import torch
import trimesh
import pyrender

from smplx import SMPL

In [8]:
# Load SMPL model
smpl_model = SMPL(model_path='./models/smpl')

# Example theta: a single frame of the first motion. Slicing with idx:idx+1
# keeps the leading batch dimension, giving shape (1, 72).
idx = 0
theta = torch.from_numpy(data["thetas"][0][idx:idx+1]).to(torch.float)
# First 3 values are the global orientation, the rest is the body pose.
# Inference only, so no autograd graph is needed.
with torch.no_grad():
    output = smpl_model(body_pose=theta[:, 3:], global_orient=theta[:, :3])

vertices = output.vertices.detach().cpu().numpy().squeeze()
faces = smpl_model.faces

# Render using trimesh + pyrender
mesh = trimesh.Trimesh(vertices, faces)
scene = pyrender.Scene()
# Bind the returned node to a name so the cell does not display a bare
# `<pyrender.node.Node ...>` repr as its output.
mesh_node = scene.add(pyrender.Mesh.from_trimesh(mesh))

# uncomment for change view and get matrix
# viewer = pyrender.Viewer(scene, use_raymond_lighting=True )
# viewer._camera_node.matrix



<pyrender.node.Node at 0x7fd598742d70>

In [9]:
import torch
from smplx import SMPL
import trimesh
import pyrender
import numpy as np
import imageio
from tqdm import tqdm


def thetas_to_video(motion, output_path, fps=30, resolution=(640, 480)):
    """Render an SMPL pose sequence to a video file.

    Every frame is posed with the SMPL model in ``./models/smpl``, rendered
    off-screen from a fixed camera and appended to the video right away, so
    the rendered frames are never all held in memory at once.

    Args:
        motion: Per-frame SMPL parameters, indexed by frame along the first
            axis (np.ndarray, torch.Tensor or nested sequence). For each frame
            the first 3 values are used as ``global_orient`` and the remaining
            ones as ``body_pose`` (i.e. shape [T, 72] for the AMASS thetas).
        output_path: Destination video path. Its parent directory is created
            if it does not exist yet.
        fps: Frame rate of the written video.
        resolution: ``(width, height)`` of the rendered frames in pixels.

    Returns:
        None. The video is written to ``output_path``.
    """
    import os  # local import keeps this block self-contained

    # Load SMPL model
    smpl_model = SMPL(model_path='./models/smpl')
    faces = smpl_model.faces

    # Normalise the input to a float32 CPU tensor that carries no autograd history.
    motion = torch.as_tensor(motion).detach().cpu().to(torch.float)

    scene = pyrender.Scene()

    # Add a camera
    camera = pyrender.PerspectiveCamera(yfov=np.pi / 3.0)
    cam_pose = np.array([
        [ 1.00, 0.05,-0.03,-0.10],
        [-0.05, 0.25,-0.97,-2.78],
        [-0.04, 0.97, 0.25, 0.61],
        [ 0.  , 0.  , 0.  , 1.  ]]
    )
    scene.add(camera, pose=cam_pose)

    # Add light (placed at the camera, shining along the view direction)
    light = pyrender.DirectionalLight(color=np.ones(3), intensity=3.0)
    scene.add(light, pose=cam_pose)

    # The video writer does not create missing directories itself.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    width, height = resolution
    renderer = pyrender.OffscreenRenderer(width, height)
    try:
        # Make sure ffmpeg is installed: pip install imageio[ffmpeg]
        with imageio.get_writer(output_path, fps=fps, format='ffmpeg') as writer, torch.no_grad():
            for i in tqdm(range(len(motion)), desc="Rendering frames"):
                theta = motion[i:i+1]  # keep the batch dimension: (1, D)
                output = smpl_model(body_pose=theta[:, 3:], global_orient=theta[:, :3])

                vertices = output.vertices.detach().cpu().numpy().squeeze()
                mesh_visual = pyrender.Mesh.from_trimesh(trimesh.Trimesh(vertices, faces))

                # Only the body mesh changes between frames: add it, render,
                # then remove it again. Camera and light stay in the scene.
                mesh_node = scene.add(mesh_visual)
                try:
                    color, _ = renderer.render(scene)
                finally:
                    scene.remove_node(mesh_node)

                writer.append_data(color)
    finally:
        # Always release the off-screen GL context, even if rendering failed.
        renderer.delete()

    print(f"✅ Saved video as {output_path}")

In [10]:
# Render the first motion of the raw file exactly as stored (all frames).
motion = data["thetas"][0]
output_path = "../output/2.smpl_motion_file.mp4"
thetas_to_video(motion, output_path)
# Last expression of the cell: embed the written video in the notebook output.
Video(output_path, embed=True)



Rendering frames: 100%|██████████| 183/183 [00:01<00:00, 109.22it/s]


✅ Saved video as ../output/2.smpl_motion_file.mp4


# Visualize a motion from the dataset

Here we extract the code that converts thetas and joints3d to *inp*, and the code for the reverse conversion.

The code converts the 24 × 3 axis-angle rotations to 24 × 6 (rot6d) and back. The last (25th) row of *inp* holds the root joint's joints3d offset from the first frame.

In [11]:
import joblib
import torch
# import torchgeometry as geometry  # Assuming this is available
import numpy as np


# Load the data
# NOTE(review): this reads the same file that an earlier cell already loaded
# into `data`; the reload only makes this cell independent of that one.
# joblib.load unpickles the file — only use it on trusted data.
data = joblib.load("./data/amass_db/amass_30fps_test.pt")


def theta_to_inp(thetas, joints3d, translation=True):
    """Convert SMPL axis-angle thetas (plus root translation) to the *inp* layout.

    Args:
        thetas: Per-frame axis-angle parameters of shape (nframes, 72), as an
            np.ndarray or torch.Tensor. They are reshaped to (nframes, 24, 3):
            the global orientation followed by 23 body joints.
        joints3d: Per-frame 3D joints, indexed as (frame, joint, xyz). Only
            joint 0 (root) is used. May be None when ``translation`` is False.
        translation: If True, append one extra row that carries the root
            joint's offset from its position in the first frame in the first
            3 of the 6 feature slots (remaining slots are zero).

    Returns:
        Float32 tensor of shape (joints, 6, nframes), where joints is 24
        without and 25 with the translation row.

    Raises:
        ValueError: If ``translation`` is True but ``joints3d`` is None.
    """
    if translation and joints3d is None:
        raise ValueError("joints3d is required when translation=True")

    # Load rotvec: 72 / 3 = 24 (23 joints + global); all frames are used.
    pose = torch.as_tensor(thetas).reshape(-1, 24, 3)

    # To rot6d: (nframes, 24, 6)
    ret = geometry.matrix_to_rotation_6d(geometry.axis_angle_to_matrix(pose))

    # Translation from joints3d
    if translation:
        nframes = ret.shape[0]
        joints = torch.as_tensor(joints3d)[:nframes]
        # Root trajectory relative to the root position in the first frame.
        root_tr = joints[:, 0, :] - joints[0, 0, :]

        # Pad the 3-D translation to the 6-D feature width so it can be
        # stacked as one more "joint" row.
        padded_tr = torch.zeros((nframes, ret.shape[2]), dtype=ret.dtype, device=ret.device)
        padded_tr[:, :3] = root_tr
        ret = torch.cat((ret, padded_tr[:, None]), 1)

    # Permute to (joints, feats, frames)
    return ret.permute(1, 2, 0).contiguous().float()


# Inverse of the rot6d packing: (joints, 6, nframes) -> (nframes, 72)
def inp_to_theta(inp):
    """Turn an *inp* tensor back into flat per-frame axis-angle thetas.

    Args:
        inp: Tensor laid out as (joints, 6, nframes). Only the first 24 joint
            rows are decoded, so an extra trailing translation row is ignored.

    Returns:
        Tensor of shape (nframes, 72): 24 axis-angle rotations per frame.
    """
    # Bring frames to the front: (nframes, joints, 6)
    per_frame = inp.permute(2, 0, 1)

    # Keep the rotation rows only, dropping a translation row if present
    rot6d = per_frame[:, :24, :]

    # rot6d -> rotation matrix -> axis-angle, shape (nframes, 24, 3)
    axis_angle = geometry.matrix_to_axis_angle(geometry.rotation_6d_to_matrix(rot6d))

    # One flat row of thetas per frame
    nframes = axis_angle.shape[0]
    return axis_angle.reshape(nframes, -1)






ind = 2
thetas_raw = data["thetas"][ind]  # (frames, 72); kept under its own name so the raw values stay available
joints3d = data["joints3d"][ind]  # Assuming (frames, num_joints, 3)

# Exercise both variants. The translation variant carries one extra row.
inp_with_translation = theta_to_inp(thetas_raw, joints3d)
inp = theta_to_inp(thetas_raw, joints3d=None, translation=False)
assert inp_with_translation.shape[0] == inp.shape[0] + 1
assert inp_with_translation.shape[1:] == inp.shape[1:]
print(inp.shape)

# Round trip: rot6d back to axis-angle thetas
thetas = inp_to_theta(inp)
print(thetas.shape)  # (nframes, 72)

output_path = "../output/2.smpl_motion_dataset.mp4"
thetas_to_video(thetas, output_path)

Video(output_path, embed=True)

torch.Size([24, 6, 171])
torch.Size([171, 72])


Rendering frames: 100%|██████████| 171/171 [00:01<00:00, 106.23it/s]


✅ Saved video as ../output/2.smpl_motion_dataset.mp4


In [12]:
# Each sample in the file can be converted to multiple dataset samples,
# e.g. sample 4 in the dataset comes from index 2 in the file.
# The dataset selects random frames.

# Decode a dataset sample back to thetas and render it.
motion = inp_to_theta(datasets["train"][4]["inp"])
output_path = "../output/2.smpl_motion_dataset_revert.mp4"
thetas_to_video(motion, output_path)
Video(output_path, embed=True)



Rendering frames: 100%|██████████| 60/60 [00:00<00:00, 102.59it/s]


✅ Saved video as ../output/2.smpl_motion_dataset_revert.mp4
