### Feed mixed input - trial notebook

1. Load model thru checkpoints, config & mint config utils
    - Similar to `evaluator.py`
1. Load tfrecord dataset as a starting point
1. Mix up inputs in the dataset to create a new input (or new dataset, whichever is easier)
1. Pass mixed input to model and see what it comes up with
1. Prepend original input motion sequence and visualize prediction vs targets
    - (targets being all the un-mixed inputs used to make the mixed input)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

from mint.core import inputs
from mint.core import model_builder
from mint.ctl import single_task_evaluator
from mint.utils import config_util
from third_party.tf_models import orbit
import tensorflow as tf

import ipywidgets as widgets
from IPython.display import display

import pprint
import hashlib
import pickle
import numpy as np
import copy
import vedo
from scipy.spatial.transform import Rotation as R
from smplx import SMPL
import time
import torch



embedWindow(verbose=True): could not load ipyvtklink try:
> pip install ipyvtklink


In [3]:
# layout helpers

layout_path_input = widgets.Layout(width='700px', height='40px')

# input widgets

wg_config_path = widgets.Text(
    value="./configs/motion_enc_pilot-audioseed-37_bsz8.config",
    placeholder="Path to config file",
    layout=layout_path_input,
)

wg_checkpoint_dir = widgets.Text(
    value="/srv/share4/anarayanan68/mint/_expts/motion_enc_pilot_bsz8_1GPU/checkpoints",
    placeholder="Checkpoint directory to restore model from",
    layout=layout_path_input,
)

wg_enc_pkl_path = widgets.Text(
    value="/srv/share4/anarayanan68/mint/_expts/motion_enc_pilot_bsz8_1GPU/enc_data.pkl",
    placeholder="Path to pkl file for motion name encoding",
    layout=layout_path_input,
)


# overall container

wg_container = widgets.VBox([
    widgets.HBox([
        widgets.Label("Path to config file:"),
        wg_config_path,
    ]),
    widgets.HBox([
        widgets.Label("Checkpoint dir:"),
        wg_checkpoint_dir,
    ]),
    widgets.HBox([
        widgets.Label("Path to encoding pkl file:"),
        wg_enc_pkl_path,
    ]),
])

display(wg_container)

VBox(children=(HBox(children=(Label(value='Path to config file:'), Text(value='./configs/motion_enc_pilot-audi…

In [4]:
wg_config_path.value, wg_checkpoint_dir.value, wg_enc_pkl_path.value

('./configs/onehot_tvloss_overfit-audioseed37_bsz8.config',
 '/coc/scratch/anarayanan68/mint/_expts/onehot_tvloss_overfit_bsz8_1GPU/checkpoints',
 '/coc/scratch/anarayanan68/mint/_expts/onehot_tvloss_overfit_bsz8_1GPU/enc_data.pkl')

In [5]:
# Config read

configs = config_util.get_configs_from_pipeline_file(wg_config_path.value)
model_config = configs['model']
eval_config = configs['eval_config']
eval_dataset_config = configs['eval_dataset']

In [6]:
# Model build & restore

model = model_builder.build(model_config, is_training=False)   # even using True would work as the arg is unused

checkpoint_manager=tf.train.CheckpointManager(
    tf.train.Checkpoint(model=model),
    directory=wg_checkpoint_dir.value,
    max_to_keep=None)

checkpoint_path = checkpoint_manager.restore_or_initialize()

if checkpoint_path is not None:
    print(f"restored model from {checkpoint_path}.")
else:
    print("initialized model.")

restored model from /coc/scratch/anarayanan68/mint/_expts/onehot_tvloss_overfit_bsz8_1GPU/checkpoints/ckpt-29999.


In [7]:
# Load original dataset
orig_dataset = inputs.create_input(
      train_eval_config=eval_config,
      dataset_config=eval_dataset_config,
      is_training=False,
      use_tpu=False,
      overfit_expt=True)

In [8]:
# see samples in dataset
for i,x in enumerate(orig_dataset):
    if i >= 2:
        break
    print(i, list(x.keys()))
    pprint.pprint(x)

0 ['audio_name', 'audio_sequence_shape', 'motion_name', 'motion_name_enc_shape', 'motion_sequence_shape', 'motion_input', 'target', 'actual_motion_input', 'audio_input']
{'actual_motion_input': <tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.        ,  0.        ,  0.        , ..., -0.0012393 ,
         -0.00159104,  0.999998  ],
        [ 0.        ,  0.        ,  0.        , ..., -0.00129578,
         -0.00156013,  0.9999979 ],
        [ 0.        ,  0.        ,  0.        , ..., -0.00141662,
         -0.00139485,  0.99999803],
        ...,
        [ 0.        ,  0.        ,  0.        , ..., -0.00184672,
         -0.00517991,  0.99998486],
        [ 0.        ,  0.        ,  0.        , ..., -0.00180948,
         -0.00516688,  0.99998504],
        [ 0.        ,  0.        ,  0.        , ..., -0.00192659,
         -0.0050804 ,  0.9999852 ]]], dtype=float32)>,
 'audio_input': <tf.Tensor: shape=(1, 240, 35), dtype=float32, numpy=
array([[[ -117.623375,  -254.53906 , 

**Each entry in the `orig_dataset` is the `inputs.py`-preprocessed dict of each `tfexample` written by `tools/preprocessing.py`**
<br>
So to make a new input, just create a dict that can be passed to the model just as `SingleTaskEvaluator` passes its `inputs`. No `tf` dataset necessary.
<br>
But, to compute the input, use some inputs from the `orig_dataset` and also the encoding scheme from `tools/preprocessing.py`. Also don't forget to use the actual motion inputs etc from there.

In [9]:
# Fns from preprocessing, for motion name encoding

from tools.preprocessing import load_enc_pkl, get_latent_from_seq_name, encode_latent_vector, get_encoded_input

In [10]:
enc_pkl_data = load_enc_pkl(wg_enc_pkl_path.value)

In [11]:
it = iter(orig_dataset)
inp1 = next(it)
inp2 = next(it)

pprint.pprint(inp1)

{'actual_motion_input': <tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.        ,  0.        ,  0.        , ..., -0.0012393 ,
         -0.00159104,  0.999998  ],
        [ 0.        ,  0.        ,  0.        , ..., -0.00129578,
         -0.00156013,  0.9999979 ],
        [ 0.        ,  0.        ,  0.        , ..., -0.00141662,
         -0.00139485,  0.99999803],
        ...,
        [ 0.        ,  0.        ,  0.        , ..., -0.00184672,
         -0.00517991,  0.99998486],
        [ 0.        ,  0.        ,  0.        , ..., -0.00180948,
         -0.00516688,  0.99998504],
        [ 0.        ,  0.        ,  0.        , ..., -0.00192659,
         -0.0050804 ,  0.9999852 ]]], dtype=float32)>,
 'audio_input': <tf.Tensor: shape=(1, 240, 35), dtype=float32, numpy=
array([[[ -117.623375,  -254.53906 , -2164.5752  , ..., -1150.2084  ,
         -2301.1704  , -1545.4581  ],
        [-1313.4275  ,  -562.8319  ,    83.6605  , ..., -1475.6674  ,
         -2226.9148  , -1312.

In [12]:
pprint.pprint(inp2)

{'actual_motion_input': <tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -9.8180119e-04,  1.8328466e-03,  9.9999785e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -9.1753050e-04,  1.8183530e-03,  9.9999791e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -9.1931503e-04,  1.7792224e-03,  9.9999797e-01],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          4.3062118e-04,  2.0214997e-03,  9.9999785e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          3.8123975e-04,  1.8915540e-03,  9.9999815e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          3.3898835e-04,  1.8229444e-03,  9.9999827e-01]]], dtype=float32)>,
 'audio_input': <tf.Tensor: shape=(1, 240, 35), dtype=float32, numpy=
array([[[  -66.404236,  -187.17056 , -1871.9159  , ...,  -977.1947  ,
         -1992.3997  , -1325.

In [13]:
# Sanity check: passing actual input and saving output
op_inp1 = model(inp1)
np.save("./_expts/op_inp1.npy", op_inp1)

In [14]:
op_inp2 = model(inp2)
np.save("./_expts/op_inp2.npy", op_inp2)

In [15]:
display(inp1['target'].shape)
display(inp1['audio_name'], inp1['motion_name'])

TensorShape([1, 60, 225])

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'mWA1'], dtype=object)>

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'gWA_sFM_cAll_d26_mWA1_ch09'], dtype=object)>

In [16]:
display(inp2['target'].shape)
display(inp2['audio_name'], inp2['motion_name'])

TensorShape([1, 60, 225])

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'mWA2'], dtype=object)>

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'gWA_sFM_cAll_d27_mWA2_ch21'], dtype=object)>

**Expt 1: Midpt *after* encoding**

In [17]:
motion_inp = 0.5 * (inp1['motion_input'] + inp2['motion_input'])
motion_inp

<tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.        ,  0.        ,  0.        , ..., -1.8544956 ,
         -1.8805697 ,  2.2783737 ],
        [ 0.        ,  0.        ,  0.        , ..., -0.32219648,
          9.714643  ,  4.2514024 ],
        [ 0.        ,  0.        ,  0.        , ..., -5.0353975 ,
         -5.4542747 ,  4.2600684 ],
        ...,
        [ 0.        ,  0.        ,  0.        , ..., -0.09499395,
         -7.082777  ,  5.330182  ],
        [ 0.        ,  0.        ,  0.        , ...,  6.1061816 ,
          6.506633  ,  4.1002517 ],
        [ 0.        ,  0.        ,  0.        , ...,  7.4434986 ,
         -1.3700666 , -7.296976  ]]], dtype=float32)>

In [18]:
list(inp1.keys())

['audio_name',
 'audio_sequence_shape',
 'motion_name',
 'motion_name_enc_shape',
 'motion_sequence_shape',
 'motion_input',
 'target',
 'actual_motion_input',
 'audio_input']

In [19]:
mix_inp__after_enc = copy.deepcopy(inp1)
mix_inp__after_enc['motion_input'] = motion_inp

pprint.pprint(mix_inp__after_enc)

{'actual_motion_input': <tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.        ,  0.        ,  0.        , ..., -0.0012393 ,
         -0.00159104,  0.999998  ],
        [ 0.        ,  0.        ,  0.        , ..., -0.00129578,
         -0.00156013,  0.9999979 ],
        [ 0.        ,  0.        ,  0.        , ..., -0.00141662,
         -0.00139485,  0.99999803],
        ...,
        [ 0.        ,  0.        ,  0.        , ..., -0.00184672,
         -0.00517991,  0.99998486],
        [ 0.        ,  0.        ,  0.        , ..., -0.00180948,
         -0.00516688,  0.99998504],
        [ 0.        ,  0.        ,  0.        , ..., -0.00192659,
         -0.0050804 ,  0.9999852 ]]], dtype=float32)>,
 'audio_input': <tf.Tensor: shape=(1, 240, 35), dtype=float32, numpy=
array([[[ -117.623375,  -254.53906 , -2164.5752  , ..., -1150.2084  ,
         -2301.1704  , -1545.4581  ],
        [-1313.4275  ,  -562.8319  ,    83.6605  , ..., -1475.6674  ,
         -2226.9148  , -1312.

In [20]:
op = model(mix_inp__after_enc)
pprint.pprint(op)   # shape 360 = sum of motion input len (120) and audio input len (240)

<tf.Tensor: shape=(1, 360, 225), dtype=float32, numpy=
array([[[ 2.19420135e-01,  1.05319418e-01,  1.24800466e-02, ...,
          5.99160604e-02, -1.57481223e-01,  1.07862961e+00],
        [ 5.71451709e-02,  2.84978151e-01,  1.22801930e-01, ...,
          3.24298620e-01, -2.31692374e-01,  8.86855125e-01],
        [ 1.29207015e-01, -8.50098357e-02, -7.02601969e-02, ...,
          1.12366292e-03, -3.22265804e-01,  1.01654541e+00],
        ...,
        [-2.17097916e+02, -1.61895615e+02,  1.05281868e+02, ...,
          1.71287556e+01, -1.68510651e+02, -3.18635040e+02],
        [ 5.00909004e+01,  6.67435608e+01, -4.03104591e+01, ...,
         -7.39148102e+01, -8.78099060e+01, -1.23307236e+02],
        [ 1.35903229e+02,  1.58804642e+02,  1.04776405e+02, ...,
         -1.36388611e+02,  2.91796613e+00, -1.19153114e+02]]],
      dtype=float32)>


In [21]:
np.save("./_expts/mix_op__after_enc.npy", op)

**Expt 2: Midpt *before* encoding**

In [22]:
name1, name2 = bytes.decode(inp1['motion_name'].numpy().item()), bytes.decode(inp2['motion_name'].numpy().item())
display(name1, name2, type(name1))

'gWA_sFM_cAll_d26_mWA1_ch09'

'gWA_sFM_cAll_d27_mWA2_ch21'

str

In [24]:
name_latent_1, name_latent_2 = get_latent_from_seq_name(name1, enc_pkl_data), get_latent_from_seq_name(name2, enc_pkl_data)
name_latent_1, name_latent_2

(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]),
 array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.]))

In [25]:
mix_latent = 0.5 * (name_latent_1 + name_latent_2)
mix_latent

array([0. , 0. , 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. ])

In [26]:
motion_inp = encode_latent_vector(mix_latent, enc_pkl_data)
motion_inp.shape

(256, 219)

In [27]:
# Sanity check with input hashes
enc_latent_inp1 = encode_latent_vector(name_latent_1, enc_pkl_data)
enc_latent_inp2 = encode_latent_vector(name_latent_2, enc_pkl_data)

# inputs have padding & batch dim
assert np.all(inp1['motion_input'].numpy()[0,:,:6] == 0)
assert np.all(inp2['motion_input'].numpy()[0,:,:6] == 0)

assert np.allclose(enc_latent_inp1[:120], inp1['motion_input'].numpy()[0,:,6:])   
assert np.allclose(enc_latent_inp2[:120], inp2['motion_input'].numpy()[0,:,6:])

assert not np.allclose(motion_inp, enc_latent_inp1)
assert not np.allclose(motion_inp, enc_latent_inp2)
assert not np.allclose(motion_inp, 0.5 * (enc_latent_inp1 + enc_latent_inp2))

In [28]:
# matching input dims
motion_inp = np.pad(motion_inp, [[0,0], [6,0]])
motion_inp = motion_inp[None, :120]
motion_inp.shape

(1, 120, 225)

In [29]:
mix_inp__before_enc = copy.deepcopy(inp1)
mix_inp__before_enc['motion_input'] = motion_inp

op = model(mix_inp__before_enc)
np.save("./_expts/mix_op__before_enc.npy", op)