In [1]:
# One-time notebook setup. The `_magic_done` sentinel keeps a re-run of this
# cell from repeating the magics below.
if "_magic_done" not in globals(): # prevent multiple run
    %load_ext autoreload
    # Mode 2: pick up edits to imported modules without restarting the kernel.
    %autoreload 2
    # The path is relative, so repeating this `%cd` from inside MotionCLIP
    # would try to enter ./MotionCLIP/MotionCLIP; hence the guard.
    %cd ./MotionCLIP
    _magic_done = True

/home/asad/workspace/DomainProject/changeDomain/notebooks/pose/4.motion_clip/MotionCLIP


In [2]:
import os
import sys
sys.path.append('.')

import clip
import torch
import numpy as np

# MotionClip
from src.parser.visualize import parser
from src.utils.misc import load_model_wo_clip
from src.datasets.get_dataset import get_datasets
from src.models.get_model import get_model as get_gen_model

In [3]:
# `parser()` is called without arguments, so the command line is emulated by
# overwriting sys.argv (presumably parser() reads it via argparse — TODO confirm).
sys.argv = [
    "notebook",  # dummy script name
    "./exps/paper-model/checkpoint_0100.pth.tar",
    "--input_file", "./assets/paper_edits.csv",
]


# `folder` and `checkpointname` are joined in a later cell to locate the
# checkpoint file; `parameters` is read like a dict ('device', 'num_frames', ...).
parameters, folder, checkpointname, epoch = parser()

In [4]:
# Load the pretrained CLIP model (ViT-B/32) and its preprocessing transform.
clip_model, clip_preprocess = clip.load(
    "ViT-B/32",
    device=parameters['device'],
    jit=False,  # Must set jit=False for training
)
# Per the original author this call is unnecessary: CLIP is already float16 by default.
clip.model.convert_weights(clip_model)

# No 'clip_training' setting (or an empty one): put CLIP in eval mode and
# freeze every one of its parameters.
if parameters.get('clip_training', '') == '':
    clip_model.eval()
    clip_model.requires_grad_(False)

In [5]:
# Which dataset split to load. Later cells index the result as datasets["test"].
split='test'
# split='all'  # need more memory
# `clip_preprocess` is the transform returned by clip.load in the previous cell.
datasets = get_datasets(parameters, clip_preprocess, split)

datapath used by amass is [./data/amass_db/amass_30fps_test.pt]


In [6]:
# from src.config import SMPL_DATA_PATH, SMPL_MODEL_PATH
# SMPL_DATA_PATH = "ok"
# SMPL_MODEL_PATH = os.path.join(SMPL_DATA_PATH, "SMPL_NEUTRAL.pkl")
# Build the generator model around the CLIP model loaded earlier.
model = get_gen_model(parameters, clip_model)

print("Restore weights..")
# `folder` and `checkpointname` come from parser().
checkpointpath = os.path.join(folder, checkpointname)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the installed torch supports it
# and the checkpoint contains nothing but tensors.
state_dict = torch.load(checkpointpath, map_location=parameters["device"])
# Presumably loads every non-CLIP weight into `model` (going by the helper's
# name) — TODO confirm against src.utils.misc.
load_model_wo_clip(model, state_dict)

  state_dict = torch.load(checkpointpath, map_location=parameters["device"])


Restore weights..


In [7]:
# def retrieve_motions(datasets, motion_collection, texts, device):
#     retrieved_motions = []
#     for txt in texts:
#         _split, _index = motion_collection[txt][0]
#         retrieved_motions.append(datasets[_split][_index]['inp'].unsqueeze(0).to(device))
#     return torch.cat(retrieved_motions, axis=0)

def retrieve_motions(inp_list, device):
    """Batch individual motion tensors into one tensor on ``device``.

    Every element of ``inp_list`` is moved to ``device`` and the elements are
    joined along a new leading dimension, so N tensors of identical shape S
    yield a tensor of shape (N, *S).
    """
    return torch.stack([sample.to(device) for sample in inp_list], dim=0)


def encode_motions(model, motions, device, num_frames=60):
    """Encode a batch of motions with the model's encoder and return its ``"mu"`` output.

    Args:
        model: object exposing ``encoder(batch)`` (returning a mapping with a
            ``"mu"`` entry) and ``lengths_to_mask(lengths)``.
        motions: batched motion tensor; its first dimension is the batch size.
        device: device on which the label and length tensors are created.
        num_frames: sequence length assigned to every motion when building the
            mask. Defaults to 60, the value that used to be hard-coded.

    Returns:
        ``model.encoder(batch)["mu"]`` for the given batch.
    """
    batch_size = motions.shape[0]
    # The encoder batch carries a 'y' entry; it is filled with zeros here.
    labels = torch.zeros(batch_size, dtype=torch.long, device=device)
    # Every motion in the batch is declared to have the same length.
    lengths = torch.full((batch_size,), num_frames, dtype=torch.long, device=device)
    batch = {
        'x': motions,
        'y': labels,
        'mask': model.lengths_to_mask(lengths),
    }
    return model.encoder(batch)["mu"]

# Hand-picked samples of the test split that are pushed through the motion encoder.
inp_list = [datasets["test"][sample_idx]["inp"] for sample_idx in (0, 2, 3, -1)]
retrieved_motions = retrieve_motions(inp_list, parameters['device'])
# The full slice selects everything along all four dimensions of the batch.
clip_features1 = encode_motions(
    model,
    retrieved_motions[:, :, :, :],
    parameters['device'],
)
clip_features1.shape

torch.Size([4, 512])

In [8]:
# Alternative source of latent codes: CLIP text embeddings instead of encoded motions.
text_prompts = ("jump", "old walk", "drunk walk")
text_embeddings = [
    model.clip_model.encode_text(
        clip.tokenize(prompt).to(parameters['device'])
    ).float()
    for prompt in text_prompts
]
# Latent arithmetic as a fourth row: "jump" + "old walk" - "drunk walk".
text_embeddings.append(text_embeddings[0] + text_embeddings[1] - text_embeddings[2])
clip_features2 = torch.cat(text_embeddings)
clip_features2.shape

torch.Size([4, 512])

In [9]:
# Stack the two feature sets and swap the first two axes, so that row i holds
# [motion-encoder feature i, text feature i].
all_clip_features = torch.stack([clip_features1, clip_features2], dim=0).transpose(0, 1)
h, w = all_clip_features.shape[:2]
# One duration per generated sequence (h * w of them), all set to num_frames.
gendurations = torch.ones((h * w, 1), dtype=int) * parameters['num_frames']

# generate the repr (joints3D/pose etc)
model.eval()
with torch.no_grad():
    generation = model.generate(
        all_clip_features,
        gendurations,
        is_amass=True,
        is_clip_features=True,
    )

# Give every output its (h, w) grid layout back. `val.shape[1:]` is empty for
# 1-D entries, so a single reshape handles both the 1-D and the N-D case.
for key in list(generation):
    val = generation[key]
    generation[key] = val.reshape(h, w, *val.shape[1:])


In [10]:
# Inspect the dimensions of the generated output after the (h, w) reshape.
generation["output"].shape

torch.Size([4, 2, 25, 6, 60])

In [11]:
import src.utils.rotation_conversions as geometry

def inp_to_theta(inp):
    """Turn one motion in 6D-rotation form into flattened axis-angle vectors.

    ``inp`` is a 3-D tensor whose last dimension is moved to the front, giving
    a frames-first layout. Only the first 24 entries along the (new) second
    dimension are kept — presumably the joint rotations, with the remaining
    entry holding the translation (TODO confirm against the dataset).

    Returns a 2-D tensor with one row per frame: the 24 axis-angle triplets
    flattened into 72 values.
    """
    frames_first = inp.permute(2, 0, 1)

    # Keep the rotation entries only: (nframes, 24, 6)
    rot6d = frames_first[:, :24, :]

    # rot6d -> rotation matrix -> axis-angle: (nframes, 24, 3)
    axis_angle = geometry.matrix_to_axis_angle(geometry.rotation_6d_to_matrix(rot6d))

    # One flat pose vector per frame: (nframes, 72)
    n_frames = axis_angle.shape[0]
    return axis_angle.reshape(n_frames, -1)


In [12]:
import torch
from smplx import SMPL
import trimesh
import pyrender
import numpy as np
import imageio
from tqdm import tqdm


def thetas_to_video(motion, output_path, fps=30, model_path='./models/smpl'):
    """Render a sequence of SMPL poses to a video file.

    Args:
        motion: ``np.ndarray`` or ``torch.Tensor`` of shape (T, 72). Per frame,
            the first 3 values are passed to SMPL as ``global_orient`` and the
            remaining ones as ``body_pose``.
        output_path: destination video file. Its parent directory is created
            if it does not exist.
        fps: frame rate of the written video (default 30, the previous
            hard-coded value).
        model_path: location handed to ``SMPL(model_path=...)`` (default is the
            previous hard-coded ``'./models/smpl'``).

    The offscreen renderer is always released, even when rendering a frame
    fails, and the SMPL forward pass runs without autograd tracking.
    """
    import os  # local import so the function does not rely on another cell

    # Length is taken before conversion, exactly like the array/tensor length.
    num_frames = len(motion)
    if isinstance(motion, np.ndarray):
        motion = torch.from_numpy(motion)
    motion = motion.to(torch.float)

    # Fail early on an unwritable destination instead of after rendering.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Load SMPL model
    smpl_model = SMPL(model_path=model_path)
    faces = smpl_model.faces

    # Fixed camera; pose found by code from notebook 2.smpl_dataVisualization.ipynb
    camera = pyrender.PerspectiveCamera(yfov=np.pi / 3.0)
    cam_pose = np.array(
        [
            [ 9.98793959e-01,  1.16659486e-03,  4.90842806e-02, 1.18703522e-01],
            [-7.82186846e-04, -9.99212735e-01,  3.96648358e-02, -1.85050091e-01],
            [ 4.90919110e-02, -3.96553915e-02, -9.98006731e-01, -2.73471684e+00],
            [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, 1.00000000e+00]]
    )
    # The light shares the camera pose.
    light = pyrender.DirectionalLight(color=np.ones(3), intensity=3.0)

    # Camera and light are added once; only the body mesh changes per frame.
    scene = pyrender.Scene()
    scene.add(camera, pose=cam_pose)
    scene.add(light, pose=cam_pose)

    frames = []
    renderer = pyrender.OffscreenRenderer(640, 480)
    try:
        # No gradients are needed for rendering; skip building the autograd graph.
        with torch.no_grad():
            for i in tqdm(range(num_frames), desc="Rendering frames"):
                theta = motion[i:i+1]
                output = smpl_model(body_pose=theta[:, 3:], global_orient=theta[:, :3])

                vertices = output.vertices.cpu().numpy().squeeze()
                mesh_visual = pyrender.Mesh.from_trimesh(trimesh.Trimesh(vertices, faces))

                # Swap in this frame's mesh, render, then take it out again.
                mesh_node = scene.add(mesh_visual)
                color, _ = renderer.render(scene)
                scene.remove_node(mesh_node)
                frames.append(color)
    finally:
        # Release the offscreen rendering context even if a frame failed.
        renderer.delete()

    # Make sure ffmpeg is installed: pip install imageio[ffmpeg]
    with imageio.get_writer(output_path, fps=fps, format='ffmpeg') as writer:
        for frame in frames:
            writer.append_data(frame)

    print(f"✅ Saved video as {output_path}")

In [13]:
# Row 3 of the generation grid: column 0 was generated from the motion-encoder
# feature, column 1 from the text-arithmetic feature.
for column, video_path in (
    (0, "../output/3.encode_decode_with_motionClip.mp4"),
    (1, "../output/3.edit_with_motionClip.mp4"),
):
    theta = inp_to_theta(generation["output"][3, column]).cpu()
    thetas_to_video(theta, video_path)



Rendering frames: 100%|██████████| 60/60 [00:00<00:00, 105.14it/s]


✅ Saved video as ../output/3.encode_decode_with_motionClip.mp4


Rendering frames: 100%|██████████| 60/60 [00:00<00:00, 107.22it/s]


✅ Saved video as ../output/3.edit_with_motionClip.mp4
