### Feed mixed input - trial notebook

1. Load the model through checkpoints, the config file & the mint config utils
    - Similar to `evaluator.py`
1. Load tfrecord dataset as a starting point
1. Mix up inputs in the dataset to create a new input (or new dataset, whichever is easier)
1. Pass mixed input to model and see what it comes up with
1. Prepend original input motion sequence and visualize prediction vs targets
    - (targets being all the un-mixed inputs used to make the mixed input)

In [1]:
import os

from mint.core import inputs
from mint.core import model_builder
from mint.ctl import single_task_evaluator
from mint.utils import config_util
from third_party.tf_models import orbit
import tensorflow as tf

import ipywidgets as widgets
from IPython.display import display

import pprint
import hashlib
import pickle
import numpy as np
import copy
import vedo
from scipy.spatial.transform import Rotation as R
from smplx import SMPL
import time
import torch



embedWindow(verbose=True): could not load ipyvtklink try:
> pip install ipyvtklink


In [2]:
# layout helpers

# Wide text boxes so long filesystem paths stay readable.
layout_path_input = widgets.Layout(width='700px', height='40px')


def _path_field(default, hint):
    """Build a text widget for a filesystem path, using the shared path layout."""
    return widgets.Text(value=default, placeholder=hint, layout=layout_path_input)


# input widgets
# NOTE(review): the values below are only defaults. The recorded outputs further down
# show a different checkpoint dir than the default coded here, so the field was
# presumably edited in the UI before the run - confirm which one is intended.

wg_config_path = _path_field(
    "./configs/motion_enc_pilot-audioseed-37_bsz8.config",
    "Path to config file",
)

wg_checkpoint_dir = _path_field(
    "/srv/share4/anarayanan68/mint/_expts/motion_enc_pilot_bsz8_1GPU/checkpoints",
    "Checkpoint directory to restore model from",
)

wg_enc_pkl_path = _path_field(
    "/srv/share4/anarayanan68/mint/_expts/motion_enc_pilot_bsz8_1GPU/enc_data.pkl",
    "Path to pkl file for motion name encoding",
)


# overall container: one labelled row per input, stacked vertically

wg_container = widgets.VBox([
    widgets.HBox([widgets.Label(caption), field])
    for caption, field in (
        ("Path to config file:", wg_config_path),
        ("Checkpoint dir:", wg_checkpoint_dir),
        ("Path to encoding pkl file:", wg_enc_pkl_path),
    )
])

display(wg_container)

VBox(children=(HBox(children=(Label(value='Path to config file:'), Text(value='./configs/motion_enc_pilot-audi…

In [3]:
# Echo the values currently held by the three path widgets; these can differ from
# the coded defaults if a field was edited in the UI.
wg_config_path.value, wg_checkpoint_dir.value, wg_enc_pkl_path.value

('./configs/motion_enc_pilot-audioseed-37_bsz8.config',
 '/coc/scratch/anarayanan68/mint/_expts/tvloss_overfit_l2_bsz8_1GPU/checkpoints',
 '/srv/share4/anarayanan68/mint/_expts/motion_enc_pilot_bsz8_1GPU/enc_data.pkl')

In [4]:
# Config read
# Parse the pipeline file named in the config-path widget, then pull out the three
# sections used below: model definition, eval settings and eval dataset settings.

configs = config_util.get_configs_from_pipeline_file(wg_config_path.value)
model_config = configs['model']
eval_config = configs['eval_config']
eval_dataset_config = configs['eval_dataset']

In [5]:
# Model build & restore

# Build the network from the parsed model config.
model = model_builder.build(model_config, is_training=False)   # even using True would work as the arg is unused

# Only the model is tracked by the checkpoint; the manager looks for saved
# checkpoints in the directory currently entered in the widget.
_model_ckpt = tf.train.Checkpoint(model=model)
checkpoint_manager = tf.train.CheckpointManager(
    _model_ckpt,
    directory=wg_checkpoint_dir.value,
    max_to_keep=None,
)

# A None path means nothing was restored, i.e. the model keeps its fresh
# initialization - predictions are then not meaningful.
checkpoint_path = checkpoint_manager.restore_or_initialize()

if checkpoint_path is None:
    print("initialized model.")
else:
    print(f"restored model from {checkpoint_path}.")

restored model from /coc/scratch/anarayanan68/mint/_expts/tvloss_overfit_l2_bsz8_1GPU/checkpoints/ckpt-29999.


In [6]:
# Load original dataset
# Built from the eval sections of the config in non-training, non-TPU mode.
# NOTE(review): overfit_expt=True presumably selects the same data handling used for
# the overfit training runs - confirm against mint/core/inputs.py.
orig_dataset = inputs.create_input(
      train_eval_config=eval_config,
      dataset_config=eval_dataset_config,
      is_training=False,
      use_tpu=False,
      overfit_expt=True)

In [9]:
# see samples in dataset
# Only the first two elements are shown: the loop stops as soon as the index reaches 2.
# For each element, print its index and keys, then pretty-print the whole dict.
# NOTE(review): pprint of the full dict dumps every tensor's values, which makes the
# saved output very long; printing shapes/dtypes would likely be enough.
for i,x in enumerate(orig_dataset):
    if i >= 2:
        break
    print(i, list(x.keys()))
    pprint.pprint(x)

0 ['audio_name', 'audio_sequence_shape', 'motion_name', 'motion_name_enc_shape', 'motion_sequence_shape', 'motion_input', 'target', 'actual_motion_input', 'audio_input']
{'actual_motion_input': <tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.6797760e-04, -1.5142176e-03,  9.9999887e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          2.5011573e-04, -1.5279740e-03,  9.9999881e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          3.1027434e-04, -1.4991260e-03,  9.9999881e-01],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.1654290e-03,  1.6448519e-03,  9.9999362e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.3644640e-03,  1.7190618e-03,  9.9999285e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.1808640e-03,  1.8863098e-03,  9.9999315e-01]]], dtype=float32)>,

**Each entry in the `orig_dataset` is the `inputs.py`-preprocessed dict of each `tfexample` written by `tools/preprocessing.py`**
<br>
So to make a new input, just create a dict that can be passed to the model just as `SingleTaskEvaluator` passes its `inputs`. No `tf` dataset necessary.
<br>
But to compute the new input, reuse some inputs from the `orig_dataset` together with the encoding scheme from `tools/preprocessing.py`. Also, don't forget to reuse the remaining fields (the actual motion inputs, etc.) from the `orig_dataset` entries.

In [7]:
# Fns from preprocessing, for motion name encoding

def compute_hashed_name(seq_name):
    """Map a sequence name to a row vector of fractions derived from its SHA-1 digest.

    The hex digest is cut into consecutive 4-character chunks; each chunk is read
    as a base-16 integer and divided by 16**4, giving a value in [0, 1).

    Args:
        seq_name: Sequence name as a `str`.

    Returns:
        A numpy array of shape (1, number_of_chunks).
    """
    stride = 4
    digest = hashlib.sha1(seq_name.encode('utf-8')).hexdigest()
    scale = 16**stride
    chunks = (digest[pos:pos+stride] for pos in range(0, len(digest), stride))
    fractions = [float(int(chunk, 16)) / scale for chunk in chunks]
    return np.array(fractions).reshape((1, -1))


def load_enc_pkl(pkl_path):
    """Load the pickled encoding data from `pkl_path`.

    Args:
        pkl_path: Path to the pickle file, or None.

    Returns:
        The unpickled object, or None when `pkl_path` is None or does not exist.
    """
    if pkl_path is None or not os.path.exists(pkl_path):
        return None
    # NOTE: pickle can execute arbitrary code on load; only point this at trusted files.
    with open(pkl_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


def get_encoded_hash(hash_np, enc_pkl_data):
    """Push a hash vector through the stored two-layer encoder.

    Computes `tanh(hash_np @ w1 + b1) @ w2 + b2` and reshapes the result to the
    stored `enc_shape`.

    Args:
        hash_np: Array that can be matrix-multiplied with `w1`.
        enc_pkl_data: Mapping with keys 'w1', 'b1', 'w2', 'b2' and 'enc_shape'.

    Returns:
        The encoded array, reshaped to `enc_pkl_data['enc_shape']`.
    """
    # Fetch every entry up front so a missing key fails before any computation.
    w1, b1, w2, b2, enc_shape = (
        enc_pkl_data[key] for key in ('w1', 'b1', 'w2', 'b2', 'enc_shape')
    )

    hidden = np.tanh(hash_np @ w1 + b1)
    encoded = hidden @ w2 + b2
    return encoded.reshape(enc_shape)


def get_encoded_input(seq_name, enc_pkl_data):
    """Hash `seq_name` with `compute_hashed_name` and encode it with `get_encoded_hash`."""
    return get_encoded_hash(compute_hashed_name(seq_name), enc_pkl_data)
    

In [8]:
# Load the encoder weights used to turn hashed motion names into motion inputs.
enc_pkl_data = load_enc_pkl(wg_enc_pkl_path.value)

# load_enc_pkl returns None instead of raising when the path is missing; fail here
# with a clear message rather than later with "'NoneType' object is not subscriptable".
if enc_pkl_data is None:
    raise FileNotFoundError(
        f"Encoding pkl file not found: {wg_enc_pkl_path.value!r}")

In [9]:
# Take the first two dataset elements as the two un-mixed source samples that the
# experiments below combine.
it = iter(orig_dataset)
inp1 = next(it)
inp2 = next(it)

# Show the full contents of the first sample.
pprint.pprint(inp1)

{'actual_motion_input': <tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.6797760e-04, -1.5142176e-03,  9.9999887e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          2.5011573e-04, -1.5279740e-03,  9.9999881e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          3.1027434e-04, -1.4991260e-03,  9.9999881e-01],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.1654290e-03,  1.6448519e-03,  9.9999362e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.3644640e-03,  1.7190618e-03,  9.9999285e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.1808640e-03,  1.8863098e-03,  9.9999315e-01]]], dtype=float32)>,
 'audio_input': <tf.Tensor: shape=(1, 240, 35), dtype=float32, numpy=
array([[[  -66.404236,  -187.17056 , -1871.9159  , ...,  -977.1947  ,
         -1992.3997  , -1325.

In [10]:
# Show the full contents of the second sample, for comparison with the first.
pprint.pprint(inp2)

{'actual_motion_input': <tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -1.4138616e-03, -4.8556848e-04,  9.9999887e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -1.4673109e-03, -4.4936457e-04,  9.9999881e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -1.3489244e-03, -3.8303231e-04,  9.9999905e-01],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -7.1387948e-04,  4.3646788e-04,  9.9999964e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -1.4102898e-03,  7.3639827e-04,  9.9999875e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -1.2916403e-03,  9.9289301e-04,  9.9999869e-01]]], dtype=float32)>,
 'audio_input': <tf.Tensor: shape=(1, 240, 35), dtype=float32, numpy=
array([[[  117.505455,    36.568863, -1092.5334  , ...,  -492.89874 ,
         -1173.2806  ,  -726.

In [11]:
# Sanity check: passing actual input and saving output
op_inp1 = model(inp1)
# np.save does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs("./_expts", exist_ok=True)
np.save("./_expts/op_inp1.npy", op_inp1)

In [12]:
# Same sanity check for the second un-mixed sample.
op_inp2 = model(inp2)
# np.save does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs("./_expts", exist_ok=True)
np.save("./_expts/op_inp2.npy", op_inp2)

In [13]:
# Target shape plus the audio and motion sequence names of the first sample.
display(inp1['target'].shape)
display(inp1['audio_name'], inp1['motion_name'])

TensorShape([1, 60, 225])

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'mWA2'], dtype=object)>

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'gWA_sFM_cAll_d27_mWA2_ch17'], dtype=object)>

In [14]:
# Target shape plus the audio and motion sequence names of the second sample.
display(inp2['target'].shape)
display(inp2['audio_name'], inp2['motion_name'])

TensorShape([1, 60, 225])

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'mWA4'], dtype=object)>

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'gWA_sFM_cAll_d27_mWA4_ch19'], dtype=object)>

**Expt 1: Midpt *after* encoding**

In [15]:
# Element-wise midpoint of the two samples' (already encoded) motion inputs.
# NOTE(review): the name `motion_inp` is reused by the second experiment further down,
# so its value depends on which cells ran last.
motion_inp = 0.5 * (inp1['motion_input'] + inp2['motion_input'])
motion_inp

<tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.        ,  0.        ,  0.        , ..., 14.789306  ,
         -3.7476146 ,  4.396989  ],
        [ 0.        ,  0.        ,  0.        , ...,  2.0571365 ,
          0.6384883 , -0.18639994],
        [ 0.        ,  0.        ,  0.        , ..., 11.601599  ,
         -5.2563176 ,  1.5932057 ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  2.1770396 ,
          2.1446245 , -9.1940155 ],
        [ 0.        ,  0.        ,  0.        , ...,  1.3384356 ,
          3.8624597 , -1.4457078 ],
        [ 0.        ,  0.        ,  0.        , ..., -3.1468363 ,
         -5.333867  , 18.754942  ]]], dtype=float32)>

In [11]:
# Keys available in a dataset element; the mixed input must provide the same ones.
list(inp1.keys())

['audio_name',
 'audio_sequence_shape',
 'motion_name',
 'motion_name_enc_shape',
 'motion_sequence_shape',
 'motion_input',
 'target',
 'actual_motion_input',
 'audio_input']

In [16]:
# Start from a deep copy of the first sample so every other key is kept as-is and
# inp1 itself is not modified, then swap in the averaged motion input.
mix_inp__after_enc = copy.deepcopy(inp1)
mix_inp__after_enc['motion_input'] = motion_inp

pprint.pprint(mix_inp__after_enc)

{'actual_motion_input': <tf.Tensor: shape=(1, 120, 225), dtype=float32, numpy=
array([[[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          1.6797760e-04, -1.5142176e-03,  9.9999887e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          2.5011573e-04, -1.5279740e-03,  9.9999881e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
          3.1027434e-04, -1.4991260e-03,  9.9999881e-01],
        ...,
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.1654290e-03,  1.6448519e-03,  9.9999362e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.3644640e-03,  1.7190618e-03,  9.9999285e-01],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -3.1808640e-03,  1.8863098e-03,  9.9999315e-01]]], dtype=float32)>,
 'audio_input': <tf.Tensor: shape=(1, 240, 35), dtype=float32, numpy=
array([[[  -66.404236,  -187.17056 , -1871.9159  , ...,  -977.1947  ,
         -1992.3997  , -1325.

In [17]:
# Run the model on the mixed (post-encoding midpoint) input.
op = model(mix_inp__after_enc)
pprint.pprint(op)   # shape 360 = sum of motion input len (120) and audio input len (240)

<tf.Tensor: shape=(1, 360, 225), dtype=float32, numpy=
array([[[-1.8552907e-01,  6.1986886e-04, -8.9536384e-02, ...,
         -1.4867255e-01,  1.5773547e-01,  7.6246375e-01],
        [ 1.8293066e-01, -4.5286570e-04,  2.8150442e-01, ...,
          9.9786207e-02, -3.6589332e-02,  1.0474453e+00],
        [ 1.5402207e-02,  8.7621577e-02,  8.8924766e-02, ...,
         -1.1416654e-01,  2.9354692e-01,  8.7109554e-01],
        ...,
        [-7.6220062e+01, -1.2719684e+01, -2.3883379e+01, ...,
          1.2643364e+02,  7.5383514e+01, -9.4295639e+01],
        [ 6.1439075e+01,  1.3702469e+02, -5.5815212e+01, ...,
          1.4889339e+02,  1.8936537e+01, -2.1564900e+02],
        [ 2.9349365e+01,  2.0578189e+02, -6.3144634e+01, ...,
         -5.9076748e+01,  4.4466381e+01, -1.4979886e+02]]], dtype=float32)>


In [18]:
# Persist the prediction for the post-encoding mix. np.save does not create missing
# parent directories, so make sure the output directory exists before writing.
os.makedirs("./_expts", exist_ok=True)
np.save("./_expts/mix_op__after_enc.npy", op)

**Expt 2: Midpt *before* encoding - at the hash**

In [19]:
# Each motion name is a one-element bytes tensor: take the single element and
# decode it to str so it can be hashed.
name1 = inp1['motion_name'].numpy().item().decode()
name2 = inp2['motion_name'].numpy().item().decode()
display(name1, name2, type(name1))

'gWA_sFM_cAll_d27_mWA2_ch17'

'gWA_sFM_cAll_d27_mWA4_ch19'

str

In [20]:
# Hash both motion names into their numeric vectors (before any encoding is applied).
name_hash_1, name_hash_2 = compute_hashed_name(name1), compute_hashed_name(name2)
name_hash_1, name_hash_2

(array([[0.09207153, 0.93508911, 0.37107849, 0.88606262, 0.72732544,
         0.67869568, 0.9442749 , 0.04432678, 0.53126526, 0.56632996]]),
 array([[0.95883179, 0.61355591, 0.09431458, 0.64544678, 0.96304321,
         0.27253723, 0.58055115, 0.97114563, 0.44119263, 0.12939453]]))

In [21]:
# Midpoint of the two hash vectors: the mix is taken *before* the encoder here.
mix_hash = 0.5 * (name_hash_1 + name_hash_2)
mix_hash

array([[0.52545166, 0.77432251, 0.23269653, 0.7657547 , 0.84518433,
        0.47561646, 0.76241302, 0.50773621, 0.48622894, 0.34786224]])

In [22]:
# Encode the mixed hash with the stored encoder weights. The result still has the
# encoder's own shape (no batch dim, no padding) at this point.
motion_inp = get_encoded_hash(mix_hash, enc_pkl_data)
motion_inp.shape

(256, 219)

In [23]:
# Sanity check with input hashes
enc_hash_inp1 = get_encoded_hash(name_hash_1, enc_pkl_data)
enc_hash_inp2 = get_encoded_hash(name_hash_2, enc_pkl_data)
# Recompute the mixed encoding locally instead of reading `motion_inp`: that name is
# overwritten with a padded, batched array later on, which would make this cell fail
# when re-run out of order.
enc_hash_mix = get_encoded_hash(mix_hash, enc_pkl_data)

# inputs have padding & batch dim
assert np.all(inp1['motion_input'].numpy()[0,:,:6] == 0)
assert np.all(inp2['motion_input'].numpy()[0,:,:6] == 0)

# Re-encoding each name must reproduce the dataset's motion input
# (first 120 rows, after the 6 padding columns).
assert np.allclose(enc_hash_inp1[:120], inp1['motion_input'].numpy()[0,:,6:])
assert np.allclose(enc_hash_inp2[:120], inp2['motion_input'].numpy()[0,:,6:])

# The pre-encoding mix must differ from both sources and from the post-encoding midpoint.
assert not np.allclose(enc_hash_mix, enc_hash_inp1)
assert not np.allclose(enc_hash_mix, enc_hash_inp2)
assert not np.allclose(enc_hash_mix, 0.5 * (enc_hash_inp1 + enc_hash_inp2))

In [24]:
# matching input dims
# Recompute the mixed encoding here instead of transforming `motion_inp` in place,
# so re-running this cell never pads an array that was already padded and batched.
mix_enc = get_encoded_hash(mix_hash, enc_pkl_data)

# Take the target layout from a real model input rather than hard-coding it.
ref_motion = inp1['motion_input'].numpy()          # (batch, frames, features)
num_frames, num_feats = ref_motion.shape[1:]
num_pad = num_feats - mix_enc.shape[-1]            # leading zero columns in the real input

motion_inp = np.pad(mix_enc, [[0, 0], [num_pad, 0]])   # zero-pad the front of the feature axis
motion_inp = motion_inp[None, :num_frames]             # add batch dim, keep the first num_frames rows
# Match the dtype of the tensor this array replaces in the model input dict.
motion_inp = motion_inp.astype(ref_motion.dtype)
motion_inp.shape

(1, 120, 225)

In [25]:
# Start from a deep copy of the first sample so every other key is kept as-is, then
# swap in the motion input built from the mixed hash.
mix_inp__before_enc_hash = copy.deepcopy(inp1)
mix_inp__before_enc_hash['motion_input'] = motion_inp

op = model(mix_inp__before_enc_hash)
# np.save does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs("./_expts", exist_ok=True)
np.save("./_expts/mix_op__before_enc_hash.npy", op)