Environment: `py39_torch271`

In [1]:
import hydra
from omegaconf import DictConfig
import torch

from pyrutils.torch.train_utils import train, save_checkpoint
from pyrutils.torch.multi_task import MultiTaskLossLearner
from vhoi.data_loading import load_training_data, select_model_data_feeder, select_model_data_fetcher
from vhoi.data_loading import determine_num_classes
from vhoi.losses import select_loss, decide_num_main_losses, select_loss_types, select_loss_learning_mask
from vhoi.models import select_model, load_model_weights
from vhoi.models_custom import TGGCN_Custom

import random
import numpy as np
import os
from pathlib import Path

from dataclasses import dataclass
from typing import Optional
from dacite import from_dict

seed = 42
random.seed(seed)   # Python的随机性
os.environ['PYTHONHASHSEED'] = str(seed)    # 设置Python哈希种子，为了禁止hash随机化，使得实验可复现
np.random.seed(seed)   # numpy的随机性
torch.manual_seed(seed)   # torch的CPU随机性，为CPU设置随机种子
torch.cuda.manual_seed(seed)   # torch的GPU随机性，为当前GPU设置随机种子
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.   torch的GPU随机性，为所有GPU设置随机种子
torch.backends.cudnn.benchmark = False   # if benchmark=True, deterministic will be False
torch.backends.cudnn.deterministic = True   # 选择确定性算法

In [2]:
from dataclasses import dataclass, asdict

class DictMixin:
    def get(self, key, default_value=None):
        return getattr(self, key, default_value)

    def as_dict(self):
        return asdict(self)
    
@dataclass
class Resources(DictMixin):
    use_gpu: bool
    num_threads: int

@dataclass
class ModelMetadata(DictMixin):
    model_name: str
    input_type: str

@dataclass
class ModelParameters(DictMixin):
    add_segment_length: int
    add_time_position: int
    time_position_strategy: str
    positional_encoding_style: str
    attention_style: str
    bias: bool
    cat_level_states: int
    discrete_networks_num_layers: int
    discrete_optimization_strategy: str
    filter_discrete_updates: bool
    gcn_node: int
    hidden_size: int
    message_humans_to_human: bool
    message_human_to_objects: bool
    message_objects_to_human: bool
    message_objects_to_object: bool
    message_geometry_to_objects: bool
    message_geometry_to_human: bool
    message_segment: bool
    message_type: str
    message_granularity: str
    message_aggregation: str
    object_segment_update_strategy: str
    share_level_mlps: int
    update_segment_threshold: float


@dataclass
class ModelOptimization(DictMixin):
    batch_size: int
    clip_gradient_at: float
    epochs: int
    learning_rate: float
    val_fraction: float


@dataclass
class BudgetLoss(DictMixin):
    add: bool
    human_weight: float
    object_weight: float


@dataclass
class SegmentationLoss(DictMixin):
    add: bool
    pretrain: bool
    sigma: float
    weight: float


@dataclass
class ModelMisc(DictMixin):
    anticipation_loss_weight: float
    budget_loss: BudgetLoss
    first_level_loss_weight: float
    impose_segmentation_pattern: int
    input_human_segmentation: bool
    input_object_segmentation: bool
    make_attention_distance_based: bool
    multi_task_loss_learner: bool
    pretrained: bool
    pretrained_path: Optional[str]
    segmentation_loss: SegmentationLoss


@dataclass
class ModelLogging(DictMixin):
    root_log_dir: str
    checkpoint_name: str
    log_dir: str


@dataclass
class Models(DictMixin):
    metadata: ModelMetadata
    parameters: ModelParameters
    optimization: ModelOptimization
    misc: ModelMisc
    logging: ModelLogging

@dataclass
class Data(DictMixin):
    name: str
    path: str
    path_zarr: str
    path_obb_zarr: str
    path_hbb_zarr: str
    path_hps_zarr: str
    cross_validation_test_subject: str
    scaling_strategy: Optional[str]
    downsampling: int

@dataclass
class Config(DictMixin):
    resources: Resources
    models: Models
    data: Data

cfg_dict = {
    "resources": {
        "use_gpu": True,
        "num_threads": 32
    },
    "models": {
        "metadata": {
            "model_name": "2G-GCN",
            "input_type": "multiple"
        },
        "parameters": {
            "add_segment_length": 0,  # length of the segment to the segment-level rnn. 0 is off and 1 is on.
            "add_time_position": 0,  # absolute time position to the segment-level rnn. 0 is off and 1 is on.
            "time_position_strategy": "s",  # input time position to segment [s] or discrete update [u].
            "positional_encoding_style": "e",  # e [embedding] or p [periodic].
            "attention_style": "v3",  # v1 [concat], v2 [dot-product], v3 [scaled_dot-product], v4 [general]
            "bias": True,
            "cat_level_states": 0,  # concatenate first and second level hidden states for predictors MLPs.
            "discrete_networks_num_layers": 1,  # depth of the state change detector MLP.
            "discrete_optimization_strategy": "gs",  # straight-through [st] or gumbel-sigmoid [gs]
            "filter_discrete_updates": False,  # maxima filter for soft output of state change detector.
            "gcn_node": 26,  # 19 for cad120, 30 for bimanual, 26 for mphoi
            "hidden_size": 512,  # 512 for cad120 & mphoi; 64 for bimanual
            "message_humans_to_human": True,  # only meaningful for bimanual and mphoi
            "message_human_to_objects": True,
            "message_objects_to_human": True,
            "message_objects_to_object": True,
            "message_geometry_to_objects": True,
            "message_geometry_to_human": False,  # false in original, note this!
            "message_segment": True,
            "message_type": "v2",  # v1 [relational] or v2 [non-relational]
            "message_granularity": "v1",  # v1 [generic] or v2 [specific]
            "message_aggregation": "att",  # mean_pooling [mp] or attention [att]
            "object_segment_update_strategy": "ind",  # same_as_human [sah], independent [ind], or conditional_on_human [coh]
            "share_level_mlps": 0,  # whether to share [1] or not [0] the prediction MLPs of the levels.
            "update_segment_threshold": 0.5  # [0.0, 1.0)
        },
        "optimization": {
        "batch_size": 8,  # mphoi:8; cad120:16; bimanual: 32
        "clip_gradient_at": 0.0,
        "epochs": 40,  # cad120 & mphoi:40; bimanual: 60
        "learning_rate": 1e-4,  # mphoi:1e-4; cad120 & bimanual:1e-3
        "val_fraction": 0.1
    },
    "misc": {
        "anticipation_loss_weight": 1.0,
        "budget_loss": {
            "add": False,
            "human_weight": 1.0,
            "object_weight": 1.0
        },
        "first_level_loss_weight": 0.0,  # if positive, first level does frame-level prediction
        "impose_segmentation_pattern": 1,  # 0 [no pattern], 1 [all ones]
        "input_human_segmentation": False,  # (was "flase" in YAML, corrected here)
        "input_object_segmentation": False,
        "make_attention_distance_based": True,  # only meaningful if message_aggregation is attention
        "multi_task_loss_learner": False,
        "pretrained": False,  # unfortunately need two entries for checkpoint name
        "pretrained_path": None,  # specified parameters must match pre-trained model
        "segmentation_loss": {
            "add": False,
            "pretrain": False,
            "sigma": 0.0,  # Gaussian smoothing
            "weight": 1.0
        }
    },
    "logging": {
        "root_log_dir": "${oc.env:PWD}/outputs_hiergat/${data.name}/${models.metadata.model_name}",
        "checkpoint_name": (
            "hs${models.parameters.hidden_size}_e${models.optimization.epochs}_bs${models.optimization.batch_size}_"
            "lr${models.optimization.learning_rate}_${models.parameters.update_segment_threshold}_${data.cross_validation_test_subject}"
        ),
        "log_dir": "${models.logging.root_log_dir}/${models.logging.checkpoint_name}"
    },
    },
    "data": {
        "name": "mphoi",
        "path": f"{os.getcwd()}/data/MPHOI/MPHOI/mphoi_ground_truth_labels.json",
        "path_zarr": f"{os.getcwd()}/data/MPHOI/MPHOI/mphoi_derived_features/faster_rcnn.zarr",
        "path_obb_zarr": f"{os.getcwd()}/data/MPHOI/MPHOI/mphoi_derived_features/object_bounding_boxes.zarr",
        "path_hbb_zarr": f"{os.getcwd()}/data/MPHOI/MPHOI/mphoi_derived_features/human_bounding_boxes.zarr",
        "path_hps_zarr": f"{os.getcwd()}/data/MPHOI/MPHOI/mphoi_derived_features/human_pose.zarr",
        "cross_validation_test_subject": "Subject14",  # Subject45, Subject25, Subject14
        "scaling_strategy": None,  # null or "standard"
        "downsampling": 3  # 1 = full FPS, 2 = half FPS, ...
    },
}

cfg = from_dict(data_class=Config, data=cfg_dict)

In [3]:
seed = 42
torch.set_num_threads(cfg.resources.num_threads)
# Data
model_name, model_input_type = cfg.models.metadata.model_name, cfg.models.metadata.input_type
batch_size, val_fraction = cfg.models.optimization.batch_size, cfg.models.optimization.val_fraction
misc_dict = cfg.get('misc', default_value={})
sigma = misc_dict.get('segmentation_loss', {}).get('sigma', 0.0)
# train_loader, val_loader, data_info, scalers = load_training_data(cfg.data, model_name, model_input_type,
#                                                                   batch_size=batch_size,
#                                                                   val_fraction=val_fraction,
#                                                                   seed=seed, debug=False, sigma=sigma)

In [4]:
# load_mphoi_training_data(...)
import json

import zarr

from vhoi.data_loading import create_data_loader

test_subject_id = cfg.data.cross_validation_test_subject

with open(cfg.data.path, mode='rb') as f:
        data = json.load(f)
root = zarr.open(cfg.data.path_zarr, mode='r')
root_obbs = zarr.open(cfg.data.path_obb_zarr, mode='r')
root_hbbs = zarr.open(cfg.data.path_hbb_zarr, mode='r')
root_hps = zarr.open(cfg.data.path_hps_zarr, mode='r')
training_data = []
for video_id, human_ground_truth in data.items():
    subject_id, task, take = video_id.split(sep='-')
    first_sub, second_sub = int(subject_id[-2]), int(subject_id[-1])
    first_test_sub, second_test_sub = int(test_subject_id[-2]), int(test_subject_id[-1])
    if (first_sub-first_test_sub)*(second_sub-second_test_sub)*(first_sub-second_test_sub)*(second_sub-first_test_sub) == 0:
        continue
    Human1_features = root[video_id]['Human1'][:]   # 0: human 1 features       => [391, 2048]
    Human2_features = root[video_id]['Human2'][:]   # 1: human 2 features       => [391, 2048]
    object_features = root[video_id]['objects'][:]  # 2: object features        => [391, 4, 2048]
                                                    # 3: human ground truth     => {'Human1': list[391], 'Human2': list[391]}
    Human1_bbs = root_hbbs[video_id]['Human1'][:]   # 4: human 1 bounding boxes => [391, 4]
    Human2_bbs = root_hbbs[video_id]['Human2'][:]   # 5: human 2 bounding boxes => [391, 4]
    objects_bbs = root_obbs[video_id]['objects'][:] # 6: object bounding boxes  => [391, 4, 4]
    Human1_hps = root_hps[video_id]['Human1'][:]    # 7: human 1 poses          => [391, 32, 2]
    Human2_hps = root_hps[video_id]['Human2'][:]    # 8: human 2 poses          => [391, 32, 2]
    training_data.append([Human1_features, Human2_features, object_features, human_ground_truth,
                            Human1_bbs, Human2_bbs, objects_bbs, Human1_hps, Human2_hps])
# train_loader, scalers, _ = create_data_loader(training_data, model_name, model_input_type, 'mphoi',
#                                                   batch_size=batch_size, shuffle=True,
#                                                   scaling_strategy=None, sigma=sigma,
#                                                   downsampling=1, test_data=False)
# val_loader, _, _ = create_data_loader(val_data, model_name, model_input_type, 'mphoi', batch_size=len(val_data),
#                                         shuffle=False, scalers=scalers, sigma=sigma, downsampling=downsampling,
#                                         test_data=False)

# training_data info:
# 0: human 1 features       => [391, 2048]
# 1: human 2 features       => [391, 2048]
# 2: object features        => [391, 4, 2048]
# 3: human ground truth     => {'Human1': list[391], 'Human2': list[391]}
# 4: human 1 bounding boxes => [391, 4]
# 5: human 2 bounding boxes => [391, 4]
# 6: object bounding boxes  => [391, 4, 4]
# 7: human 1 poses          => [391, 32, 2]
# 8: human 2 poses          => [391, 32, 2]

# print("training_data:")
# for d in training_data:
#     for di in d:
#         if isinstance(di, np.ndarray):
#             print(di.shape)
#         elif isinstance(di, dict):
#             for diik, diiv in di.items():
#                 print(diik, "=>", len(diiv))
#         else:
#             print(di)
#     break

In [5]:
shuffle = True
scaling_strategy = None
scalers = None
test_data = False
downsampling = 1

In [6]:
# ================================================================================================================================
# CUSTOM
# ================================================================================================================================
human_features_list = []
human_boxes_list = []
human_poses_list = []

object_features_list = []
object_boxes_list = []

gt_list = []
xs_steps = []

for data_i in training_data:
    # ================================================================================================================================
    # HUMANS
    # ================================================================================================================================
    human_features_list.append([
        # ==== ORIGINAL ====
        data_i[0], 
        # data_i[1], 
        
        # ==== CUSTOM ====
        # data_i[0], data_i[1],
    ])
    human_boxes_list.append([
        # ==== ORIGINAL ====
        data_i[4], 
        # data_i[5],
        
        # ==== CUSOTM ====
        # data_i[4], data_i[5], 
    ])
    human_poses_list.append([
        # ==== ORIGINAL ====
        data_i[7], 
        # data_i[8], 
        
        # ==== CUSTOM ====
        # data_i[7], data_i[8],
    ])
    
    # ================================================================================================================================
    # OBJECTS
    # ================================================================================================================================
    object_features = data_i[2]
    # object_features_rand = np.random.randn(object_features.shape[0], 3, object_features.shape[2])
    # object_features = np.concatenate((object_features, object_features_rand), axis=1) 
    object_features = object_features[:, 0:1, :]
    
    object_boxes = data_i[6]
    # object_boxes_rand = np.random.randn(object_boxes.shape[0], 3, object_boxes.shape[2])
    # object_boxes = np.concatenate((object_boxes, object_boxes_rand), axis=1)
    object_boxes = object_boxes[:, 0:1, :]
    
    assert object_features.shape[:-1] == object_boxes.shape[:-1], f"object_features.shape[:-1]: {object_features.shape[:-1]} != object_boxes.shape[:-1]:{object_boxes.shape[:-1]}"
    
    object_features_list.append(object_features)
    object_boxes_list.append(object_boxes)
    
    # ================================================================================================================================
    # MISC.
    # ================================================================================================================================
    gt_list.append(
        {
            # ==== ORIGINAL ====
            'Human1': data_i[3]['Human1'],
            # 'Human2': data_i[3]['Human2'],
            
            # ==== CUSTOM ====
            # 'Human3': data_i[3]['Human1'],
            # 'Human4': data_i[3]['Human2'],
        }
    )
    
    ### assemble_num_steps(...)
    x = data_i[0]
    # print(x.shape)
    num_steps = len(x[downsampling - 1::downsampling])
    xs_steps.append(num_steps)
    
xs_steps = np.array(xs_steps, dtype=np.float32)

print("human_features_list[0][0]:", human_features_list[0][0].shape)
print("human_boxes_list[0][0]:", human_boxes_list[0][0].shape)
print("human_poses_list[0][0]:", human_poses_list[0][0].shape)
print()

print("object_features_list:", object_features_list[0].shape)
print("object_boxes_list:", object_boxes_list[0].shape)
print()

print("gt_list:", gt_list[0].keys(), gt_list[0]['Human1'])
print()

print("xs_steps:", xs_steps)

human_features_list[0][0]: (391, 2048)
human_boxes_list[0][0]: (391, 4)
human_poses_list[0][0]: (391, 32, 2)

object_features_list: (391, 1, 2048)
object_boxes_list: (391, 1, 4)

gt_list: dict_keys(['Human1']) [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1

In [7]:
from pyrutils.itertools import run_length_encoding
from vhoi.data_loading import segmentation_from_output_class

def assemble_mphoi_frame_level_recurrent_human(
    human_features_list, human_poses_list, object_boxes_list, gt_list,
    downsampling: int = 1, test_data: bool = False, max_no_objects: int = 4
):
    """
    Assemble recurrent human features for multiple humans in frame-level videos.

    Args:
        human_features_list:  list of list of human feature arrays per video
        human_poses_list:     list of list of human pose arrays per video
        object_boxes_list:    list of object bounding box arrays per video
        gt_list: list of dicts {"Human1": labels, "Human2": ..., }
        downsampling:         frame downsampling factor
        test_data:            if True, skip label slicing
        max_no_objects:       maximum number of objects in bounding boxes

    Returns:
        xs: [x_hs, x_hs_segmentation]
        ys: [y_rec_hs, y_pred_hs, y_hs_segmentation]
    """
    xs_h, xs_hp, x_obb = [], [], []
    max_len, max_len_downsampled = 0, 0

    # If not provided, infer maximum number of objects across all videos
    if max_no_objects is None:
        max_no_objects = max(
            max(len(frame) for frame in video) for video in object_boxes_list
        )

    # Loop over videos
    for humans, poses, objects_bounding_box in zip(human_features_list, human_poses_list, object_boxes_list):
        num_humans = len(humans)
        max_len = max(max_len, humans[0].shape[0])

        # Downsample humans/poses
        humans_ds = [h[downsampling - 1::downsampling] for h in humans]
        poses_ds  = [p[downsampling - 1::downsampling] / 1000 for p in poses]
        max_len_downsampled = max(max_len_downsampled, humans_ds[0].shape[0])
        xs_h.append(humans_ds)
        xs_hp.append(poses_ds)

        # Downsample objects
        obb_ds = objects_bounding_box[downsampling - 1::downsampling] / 1000
        x_obb.append(obb_ds)

    # Reshape object bounding boxes
    xs_obb = []
    for video in x_obb:
        bb = []
        for frame in video:
            b = np.zeros((max_no_objects, 4))
            n = min(len(frame), max_no_objects)
            b[:n] = frame[:n]     # pad if fewer, truncate if more
            b = b.reshape(max_no_objects * 2, 2)
            bb.append(b)
        xs_obb.append(bb)

    # Add context features to each human
    keypoints = [1, 2, 4, 6, 7, 11, 13, 14, 27]  # upper body keypoints
    xs_h_with_context = []
    for i, (humans_ds, poses_ds, obb_video) in enumerate(zip(xs_h, xs_hp, xs_obb)):
        num_humans = len(humans_ds)
        humans_context = [[] for _ in range(num_humans)]

        for j in range(len(humans_ds[0])):  # loop frames
            obb = obb_video[j]

            # Compute velocities
            if j + 1 < len(humans_ds[0]):
                next_poses = [p[j+1][keypoints] for p in poses_ds]
                pose_velos = [(next_pose - poses_ds[h][j][keypoints]) * 100 for h, next_pose in enumerate(next_poses)]
                obb_velo = (obb_video[j+1] - obb) * 100
            else:
                pose_velos = [np.zeros((len(keypoints), 2)) for _ in poses_ds]
                obb_velo = np.zeros((max_no_objects * 2, 2))

            obbvelo = np.hstack((obb, obb_velo)).reshape(1, -1)

            # Context per human
            context = []
            for h in range(num_humans):
                pose = poses_ds[h][j][keypoints]
                velo = pose_velos[h]
                posevelo = np.hstack((pose, velo)).reshape(1, -1)
                context.append(posevelo[0])
            # Flatten [h1, h2, ..., obb] into context vector
            context = np.concatenate(context + [obbvelo[0]])

            # Concatenate context to each human’s own frame features
            for h in range(num_humans):
                h_con = np.concatenate((humans_ds[h][j], context))
                humans_context[h].append(h_con)

        xs_h_with_context.append([np.array(hc) for hc in humans_context])

    # Stack humans into one array
    feature_size = xs_h_with_context[0][0].shape[-1]
    num_humans = len(xs_h_with_context[0])
    x_hs = np.full([len(xs_h_with_context), max_len_downsampled, num_humans, feature_size],
                   fill_value=np.nan, dtype=np.float32)

    for m, humans in enumerate(xs_h_with_context):
        for h, feats in enumerate(humans):
            x_hs[m, :len(feats), h] = feats

    xs = [x_hs]

    # Output labels
    y_rec_hs = np.full([len(x_hs), max_len, num_humans], fill_value=-1, dtype=np.int64)
    y_pred_hs = np.full_like(y_rec_hs, fill_value=-1)

    for m, video_hands_ground_truth in enumerate(gt_list):
        for h in range(num_humans):
            human_key = f"Human{h+1}"
            if human_key not in video_hands_ground_truth:
                continue
            y_h = video_hands_ground_truth[human_key]
            y_rec_hs[m, :len(y_h), h] = y_h
            rle = list(run_length_encoding(y_h))
            y_h_p = []
            for (_, prev_len), (next_label, _) in zip(rle[:-1], rle[1:]):
                y_h_p += [next_label] * prev_len
            y_pred_hs[m, :len(y_h_p), h] = y_h_p

    x_hs_segmentation = segmentation_from_output_class(
        y_rec_hs[:, downsampling - 1::downsampling],
        segmentation_type="input"
    )
    xs.append(x_hs_segmentation)

    if not test_data:
        y_rec_hs = y_rec_hs[:, downsampling - 1::downsampling]
        y_pred_hs = y_pred_hs[:, downsampling - 1::downsampling]

    y_hs_segmentation = segmentation_from_output_class(y_rec_hs, segmentation_type="output")
    ys = [y_rec_hs, y_pred_hs, y_hs_segmentation]

    return xs, ys

xs, ys = assemble_mphoi_frame_level_recurrent_human(human_features_list, human_poses_list, object_boxes_list, gt_list)

for xs_i, ys_i in zip(xs, ys):
    print("xs_i.shape:", xs_i.shape)
    print("ys_i.shape:", ys_i.shape, ys_i.dtype, ys_i.max().item(), ys_i.min().item())
    print()

xs_i.shape: (27, 464, 1, 2116)
ys_i.shape: (27, 464, 1) int64 12 -1

xs_i.shape: (27, 464, 1)
ys_i.shape: (27, 464, 1) int64 12 -1



In [8]:
def assemble_mphoi_frame_level_recurrent_objects(object_features_list, downsampling: int = 1):
    xs_objects = []
    max_len, max_len_downsampled, max_num_objects = 0, 0, 0
    for objects in object_features_list:
        max_len = max(max_len, objects.shape[0])
        max_num_objects = max(max_num_objects, objects.shape[1])
        objects = objects[downsampling - 1::downsampling]
        max_len_downsampled = max(max_len_downsampled, objects.shape[0])
        xs_objects.append(objects)
    feature_size = xs_objects[-1].shape[-1]
    x_objects = np.full([len(xs_objects), max_len_downsampled, max_num_objects, feature_size],
                        fill_value=np.nan, dtype=np.float32)
    x_objects_mask = np.zeros([len(xs_objects), max_num_objects], dtype=np.float32)
    for m, x_o in enumerate(xs_objects):
        x_objects[m, :x_o.shape[0], :x_o.shape[1], :] = x_o
        x_objects_mask[m, :x_o.shape[1]] = 1.0
    xs = [x_objects, x_objects_mask]
    return xs

xs_objects = assemble_mphoi_frame_level_recurrent_objects(object_features_list)
for xs_i in xs_objects:
    print(xs_i.shape, type(xs_i))

(27, 464, 1, 2048) <class 'numpy.ndarray'>
(27, 1) <class 'numpy.ndarray'>


In [9]:
import itertools

from vhoi.data_loading import compute_centroid

def assemble_mphoi_human_human_distances(human_boxes_list, downsampling: int = 1):
    """
    Compute pairwise human-human distances for multiple humans across videos.

    Args:
        human_boxes_list: list of list of human bounding boxes per video
                          (outer list: videos, inner list: humans, array: frames x 4)
        downsampling:     frame downsampling factor

    Returns:
        x_hh_dists: tensor [num_videos, max_len, N, N] with pairwise distances
    """
    mphoi_dims = np.array([3840, 2160], dtype=np.float32)
    max_len, max_num_humans = 0, 0
    all_dists = []

    for video_bbs in human_boxes_list:
        num_humans = len(video_bbs)
        max_num_humans = max(max_num_humans, num_humans)

        # Downsample and compute centroids
        centroids = []
        for bb in video_bbs:
            bb = bb[downsampling - 1::downsampling]
            c = compute_centroid(bb) / mphoi_dims
            centroids.append(c)

        # Length of this video (frames)
        max_len = max(max_len, centroids[0].shape[0])

        # Compute pairwise distances (frames x N x N)
        T = centroids[0].shape[0]
        dists_matrix = np.zeros((T, num_humans, num_humans), dtype=np.float32)

        for i, j in itertools.combinations(range(num_humans), 2):
            d = np.linalg.norm(centroids[i] - centroids[j], ord=2, axis=-1)
            dists_matrix[:, i, j] = d
            dists_matrix[:, j, i] = d

        all_dists.append(dists_matrix)

    # Pad into a tensor [num_videos, max_len, max_num_humans, max_num_humans]
    tensor_shape = [len(all_dists), max_len, max_num_humans, max_num_humans]
    x_hh_dists = np.full(tensor_shape, fill_value=np.nan, dtype=np.float32)

    for m, dists_matrix in enumerate(all_dists):
        T, N, _ = dists_matrix.shape
        x_hh_dists[m, :T, :N, :N] = dists_matrix

    return x_hh_dists


xs_hh_dists = assemble_mphoi_human_human_distances(human_boxes_list)
print(xs_hh_dists.shape)

(27, 464, 1, 1)


In [10]:
def assemble_mphoi_human_object_distances(human_boxes_list, object_boxes_list, downsampling: int = 1):
    """
    Compute human-object distances for multiple humans and objects across videos.

    Args:
        human_boxes_list:  list of list of human bounding boxes per video
                           (outer list: videos, inner list: humans, array: frames x 4)
        object_boxes_list: list of object bounding box arrays per video (frames x num_objects x 4)
        downsampling:      frame downsampling factor

    Returns:
        x_ho_dists: tensor [num_videos, max_len, max_num_humans, max_num_objects]
    """
    mphoi_dims = np.array([3840, 2160], dtype=np.float32)
    max_len, max_num_humans, max_num_objects = 0, 0, 0
    all_dists = []

    for video_bbs, obj_bbs in zip(human_boxes_list, object_boxes_list):
        num_humans = len(video_bbs)

        # Downsample humans → centroids
        human_centroids = []
        for bb in video_bbs:
            bb = bb[downsampling - 1::downsampling]
            c = compute_centroid(bb) / mphoi_dims
            human_centroids.append(c)

        # Downsample objects → centroids
        obj_bbs = obj_bbs[downsampling - 1::downsampling]
        obj_centroids = compute_centroid(obj_bbs) / mphoi_dims

        T = obj_centroids.shape[0]
        max_len = max(max_len, T)
        max_num_humans = max(max_num_humans, num_humans)
        max_num_objects = max(max_num_objects, obj_centroids.shape[1])

        # Compute distances [frames, num_humans, num_objects]
        dists_matrix = np.zeros((T, num_humans, obj_centroids.shape[1]), dtype=np.float32)
        for h, h_c in enumerate(human_centroids):
            d = np.linalg.norm(obj_centroids - np.expand_dims(h_c, axis=1), ord=2, axis=-1)
            dists_matrix[:, h, :] = d

        all_dists.append(dists_matrix)

    # Pad into a tensor [num_videos, max_len, max_num_humans, max_num_objects]
    tensor_shape = [len(all_dists), max_len, max_num_humans, max_num_objects]
    x_ho_dists = np.full(tensor_shape, fill_value=np.nan, dtype=np.float32)

    for m, dists_matrix in enumerate(all_dists):
        T, H, O = dists_matrix.shape
        x_ho_dists[m, :T, :H, :O] = dists_matrix

    return x_ho_dists

xs_ho_dists = assemble_mphoi_human_object_distances(human_boxes_list, object_boxes_list)
print(xs_ho_dists.shape)

(27, 464, 1, 1)


In [11]:
import numpy as np

def assemble_mphoi_object_object_distances(object_boxes_list, downsampling: int = 1):
    """
    Compute pairwise object-object distances across videos.

    Args:
        object_boxes_list: list of object bounding box arrays per video (frames x num_objects x 4)
        downsampling:      frame downsampling factor

    Returns:
        x_oo_dists: tensor [num_videos, max_len, max_num_objects, max_num_objects]
    """
    mphoi_dims = np.array([3840, 2160], dtype=np.float32)
    max_len, max_num_objects = 0, 0
    all_dists = []

    for obj_bbs in object_boxes_list:
        # Downsample and compute centroids
        obj_bbs = obj_bbs[downsampling - 1::downsampling]
        objs_centroid = compute_centroid(obj_bbs) / mphoi_dims   # (frames, num_objects, 2)
        num_objects = objs_centroid.shape[1]

        # Compute pairwise distances per frame
        dists = []
        for k in range(num_objects):
            kth_object_centroid = objs_centroid[:, k:k+1]  # (frames, 1, 2)
            kth_dist = np.linalg.norm(objs_centroid - kth_object_centroid, ord=2, axis=-1)  # (frames, num_objects)
            dists.append(kth_dist)

        dists = np.stack(dists, axis=1)  # (frames, num_objects, num_objects)
        all_dists.append(dists)

        max_len = max(max_len, obj_bbs.shape[0])
        max_num_objects = max(max_num_objects, num_objects)

    # Pad into tensor [num_videos, max_len, max_num_objects, max_num_objects]
    tensor_shape = [len(all_dists), max_len, max_num_objects, max_num_objects]
    x_oo_dists = np.full(tensor_shape, fill_value=np.nan, dtype=np.float32)

    for m, dists in enumerate(all_dists):
        T, O1, O2 = dists.shape
        x_oo_dists[m, :T, :O1, :O2] = dists

    return x_oo_dists

xs_oo_dists = assemble_mphoi_object_object_distances(object_boxes_list)
print(xs_oo_dists.shape)

(27, 464, 1, 1)


In [12]:
# xs info:
# 0: x_human                    => [8, 154, 2, 2152]
# 1: x_objects                  => [8, 154, 4, 2048]
# 2: objects_mask               => [8, 4]
# 3: human_segmentation         => [8, 154, 2]
# 4: human_human_distances      => [8, 154, 2, 2]
# 5: human_object_distances     => [8, 154, 2, 4]
# 6: object_object_distances    => [8, 154, 4, 4]
# 7: steps_per_example          => [8]
# see ./vhoi/models.py, line 612, for more info

#    0,       1, 2,        3,        4,           5,           6,           7
# xs = xs[:1] + xs_objects + xs[1:] + [xs_hh_dists, xs_ho_dists, xs_oo_dists, xs_steps]
# for xsi in xs:
#     print(xsi.shape)

In [13]:
### create_data_loader(...)
from vhoi.data_loading import (
    # assemble_mphoi_tensors, 
    assemble_mphoi_frame_level_recurrent_objects, 
    ignore_last_step_end_flag_general,
    smooth_segmentation,
    assemble_mphoi_human_human_distances,
    assemble_mphoi_human_object_distances,
    assemble_mphoi_object_object_distances,
    assemble_num_steps,
)

data = training_data

# if dataset_name.lower() == 'cad120':
#     x, y = assemble_tensors(data, model_name, model_input_type, sigma=sigma, downsampling=downsampling,
#                             test_data=test_data)
# elif dataset_name.lower() == 'mphoi':
# x, y = assemble_mphoi_tensors(data=training_data, model_name=model_name, sigma=sigma, downsampling=downsampling, test_data=test_data)

### assemble_mphoi_tensors(...)
# from vhoi.data_loading import assemble_mphoi_frame_level_recurrent_human
# xs, ys = assemble_mphoi_frame_level_recurrent_human(data, downsampling=downsampling, test_data=test_data)

# xs (assemble_mphoi_frame_level_recurrent_human) info:
# 0: x_human            => [8, 154, 2, 2152]
# 1: human_segmentation => [8, 154, 2]

# xs_objects = assemble_mphoi_frame_level_recurrent_objects(data, downsampling=downsampling)
# if model_name == '2G-GCN':
if sigma:
    ys[2] = ignore_last_step_end_flag_general(ys[2])
ys[2] = smooth_segmentation(ys[2], sigma)
ys_budget = ys[2]
# xs_hh_dists = assemble_mphoi_human_human_distances(data, downsampling=downsampling)
# xs_ho_dists = assemble_mphoi_human_object_distances(data, downsampling=downsampling)
# xs_oo_dists = assemble_mphoi_object_object_distances(data, downsampling=downsampling)
# xs_steps = assemble_num_steps(data, downsampling=downsampling)

# print("xs:")
# for xsi in xs:
#     print(xsi.shape)
# print()

# xs info:
# 0: x_human                    => [8, 154, 2, 2152]
# 1: x_objects                  => [8, 154, 4, 2048]
# 2: objects_mask               => [8, 4]
# 3: human_segmentation         => [8, 154, 2]
# 4: human_human_distances      => [8, 154, 2, 2]
# 5: human_object_distances     => [8, 154, 2, 4]
# 6: object_object_distances    => [8, 154, 4, 4]
# 7: steps_per_example          => [8]
# see ./vhoi/models.py, line 612, for more info

#    0,       1, 2,        3,        4,           5,           6,           7
xs = xs[:1] + xs_objects + xs[1:] + [xs_hh_dists, xs_ho_dists, xs_oo_dists, xs_steps]
ys = [ys_budget] + ys[2:] + ys[:2]
ys += ys[-2:]
x, y = xs, ys

print("x:")
for xi in x:
    print(xi.shape)
print()

# else:
#     x, y = assemble_bimanual_tensors(data, model_name, sigma=sigma, downsampling=downsampling, test_data=test_data)

x:
(27, 464, 1, 2116)
(27, 464, 1, 2048)
(27, 1)
(27, 464, 1)
(27, 464, 1, 1)
(27, 464, 1, 1)
(27, 464, 1, 1)
(27,)



In [14]:
print("y:")
for y_i in y:
    print(y_i.shape)
print()

y:
(27, 464, 1)
(27, 464, 1)
(27, 464, 1)
(27, 464, 1)
(27, 464, 1)
(27, 464, 1)



In [15]:
from torch.utils.data import TensorDataset, DataLoader

from vhoi.data_loading import maybe_scale_input_tensors
from pyrutils.torch.train_utils import numpy_to_torch

x, y = xs, ys

x, scalers = maybe_scale_input_tensors(x, model_name, scaling_strategy=scaling_strategy, scalers=scalers)
x = [np.nan_to_num(ix, copy=False, nan=0.0) for ix in x]
x, y = numpy_to_torch(*x), numpy_to_torch(*y)

# print("x (numpy_to_torch):")
# for xi in x:
#     print(xi.shape)
# print()

dataset = TensorDataset(*(x + y))

print("dataset:")
for d in dataset.tensors:
    print(d.shape)
print()
    
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0,
                            pin_memory=False, drop_last=False)
# segmentations = assemble_segmentations(data, model_name, dataset_name=dataset_name)

print("data_loader")
for idx, i in enumerate(next(iter(data_loader))):
    print(i.shape, i.dtype)
    # if idx == 6:
    #     print()

dataset:
torch.Size([27, 464, 1, 2116])
torch.Size([27, 464, 1, 2048])
torch.Size([27, 1])
torch.Size([27, 464, 1])
torch.Size([27, 464, 1, 1])
torch.Size([27, 464, 1, 1])
torch.Size([27, 464, 1, 1])
torch.Size([27])
torch.Size([27, 464, 1])
torch.Size([27, 464, 1])
torch.Size([27, 464, 1])
torch.Size([27, 464, 1])
torch.Size([27, 464, 1])
torch.Size([27, 464, 1])

data_loader
torch.Size([8, 464, 1, 2116]) torch.float32
torch.Size([8, 464, 1, 2048]) torch.float32
torch.Size([8, 1]) torch.float32
torch.Size([8, 464, 1]) torch.float32
torch.Size([8, 464, 1, 1]) torch.float32
torch.Size([8, 464, 1, 1]) torch.float32
torch.Size([8, 464, 1, 1]) torch.float32
torch.Size([8]) torch.float32
torch.Size([8, 464, 1]) torch.float32
torch.Size([8, 464, 1]) torch.float32
torch.Size([8, 464, 1]) torch.int64
torch.Size([8, 464, 1]) torch.int64
torch.Size([8, 464, 1]) torch.int64
torch.Size([8, 464, 1]) torch.int64


In [16]:
from vhoi.data_loading import input_size_from_data_loader

train_loader = data_loader

input_size = input_size_from_data_loader(train_loader, model_name, model_input_type)
data_info = {'input_size': input_size}

In [17]:
# Model
### Model = select_model(model_name)
model_creation_args = cfg.models.parameters
### model_creation_args = {**data_info, **model_creation_args}
model_creation_args = {**data_info, **model_creation_args.__dict__}
dataset_name = cfg.data.name
num_classes = determine_num_classes(model_name, model_input_type, dataset_name)
model_creation_args['num_classes'] = num_classes
device = 'cuda' if torch.cuda.is_available() and cfg.resources.use_gpu else 'cpu'

# Disable geometry features, as they do not support more than 2 humans
model_creation_args['message_geometry_to_objects'] = True
model_creation_args['message_geometry_to_human'] = True

# Model configuration if object is only one
model_creation_args['message_objects_to_object'] = False
model_creation_args['gcn_node'] = 17

# Model configuration if human is only one
model_creation_args['message_humans_to_human'] = False

model = TGGCN_Custom(**model_creation_args).to(device)
if misc_dict.get('pretrained', False) and misc_dict.get('pretrained_path') is not None:
    state_dict = load_model_weights(misc_dict['pretrained_path'])
    model.load_state_dict(state_dict, strict=False)
params = model.parameters()
optimizer = torch.optim.Adam(params, lr=cfg.models.optimization.learning_rate)
criterion, loss_names = select_loss(model_name, model_input_type, dataset_name, cfg=cfg)
mtll_model = None
if misc_dict.get('multi_task_loss_learner', False):
    loss_types = select_loss_types(model_name, dataset_name, cfg=cfg)
    mask = select_loss_learning_mask(model_name, dataset_name, cfg=cfg)
    mtll_model = MultiTaskLossLearner(loss_types=loss_types, mask=mask).to(device)
    optimizer.add_param_group({'params': mtll_model.parameters()})

In [18]:
tensorboard_log_dir = cfg.models.logging.root_log_dir
checkpoint_name = cfg.models.logging.checkpoint_name
fetch_model_data = select_model_data_fetcher(model_name, model_input_type,
                                             dataset_name=dataset_name, **{**misc_dict, **cfg.models.parameters.__dict__})
feed_model_data = select_model_data_feeder(model_name, model_input_type, dataset_name=dataset_name, **misc_dict)
num_main_losses = decide_num_main_losses(model_name, dataset_name, {**misc_dict, **cfg.models.parameters.__dict__})
# checkpoint = train(model, train_loader, optimizer, criterion, cfg.optimization.epochs, device, loss_names,
#                    clip_gradient_at=cfg.optimization.clip_gradient_at,
#                    fetch_model_data=fetch_model_data, feed_model_data=feed_model_data,
#                    val_loader=val_loader, mtll_model=mtll_model, num_main_losses=num_main_losses,
#                    tensorboard_log_dir=tensorboard_log_dir, checkpoint_name=checkpoint_name)

In [19]:
# checkpoint_name = kwargs.get('checkpoint_name', None)
# tensorboard_log_dir = kwargs.get('tensorboard_log_dir', None)
# writer = None
# if tensorboard_log_dir is not None and checkpoint_name is not None:
#     writer = SummaryWriter(os.path.join(tensorboard_log_dir, 'runs', checkpoint_name))
checkpoint = {}
train_losses, val_losses, train_raw_losses, val_raw_losses = [], [], [], []
val_loss = float('Inf')

In [20]:
data_loader = train_loader
clip_gradient_at=0.0
log_interval=25

model.train()
if mtll_model is not None:
    mtll_model.train()
num_examples = len(data_loader.dataset)
for batch_idx, dataset in enumerate(data_loader):
    data, target = fetch_model_data(dataset, device=device)
    for t in target:
        print(t.shape, t.dtype, t.max().item(), t.min().item())
    print()
    optimizer.zero_grad()
    output = feed_model_data(model, data)
    for o in output:
        print(o.shape, o.dtype, o.max().item(), o.min().item())
    print()
    losses = criterion(output, target, reduction='mean')
    for l in losses:
        print(l)
    print()
    if mtll_model is not None:
        losses = mtll_model(losses)
    loss = sum(losses)
    loss.backward()
    if clip_gradient_at:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_gradient_at)
    optimizer.step()
    log_now, is_last_batch = (batch_idx % log_interval) == 0, batch_idx == (len(data_loader) - 1)
    if log_now or is_last_batch:
        num_main_losses = num_main_losses if num_main_losses is not None else len(losses)
        loss = sum(losses[-num_main_losses:])
        batch_initial_example_idx = min((batch_idx + 1) * data_loader.batch_size, num_examples)
        epoch_progress = 100 * (batch_idx + 1) / len(data_loader)
        print(f'(Train) Batch [{batch_initial_example_idx:6d}/{num_examples:6d} ({epoch_progress:3.0f}%)] ',
                f'Loss: {loss.item(): 8.4f}', end='')
        for loss_name, single_loss in zip(loss_names, losses):
            print(f'  {loss_name}: {single_loss: 6.4f}', end='')
        print()
        print()
    
    # Test for only single batch
    # break

torch.Size([8, 464, 1]) torch.float32 1.0 -1.0
torch.Size([8, 464, 1]) torch.float32 1.0 -1.0
torch.Size([8, 464, 1]) torch.int64 12 -1
torch.Size([8, 464, 1]) torch.int64 12 -1
torch.Size([8, 464, 1]) torch.int64 12 -1
torch.Size([8, 464, 1]) torch.int64 12 -1



torch.Size([8, 464, 1]) torch.float32 1.0 0.0
torch.Size([8, 464, 1]) torch.float32 0.9998103976249695 5.2600487833842635e-05
torch.Size([8, 13, 464, 1]) torch.float32 -2.1928343772888184 -2.9663479328155518
torch.Size([8, 13, 464, 1]) torch.float32 -2.199812173843384 -3.0870087146759033
torch.Size([8, 13, 464, 1]) torch.float32 -2.434875726699829 -2.7114181518554688
torch.Size([8, 13, 464, 1]) torch.float32 -2.4137790203094482 -2.694533348083496

tensor(0., device='cuda:0', grad_fn=<MulBackward0>)
tensor(0., device='cuda:0', grad_fn=<MulBackward0>)
tensor(0., device='cuda:0', grad_fn=<MulBackward0>)
tensor(0., device='cuda:0', grad_fn=<MulBackward0>)
tensor(2.5714, device='cuda:0', grad_fn=<MulBackward0>)
tensor(2.5326, device='cuda:0', grad_fn=<MulBackward0>)

  BCE_HS:  0.0000
  NLL_SAR_F:  0.0000
  NLL_SAP_F:  0.0000
  NLL_SAR:  2.5714
  NLL_SAP:  2.5326

torch.Size([8, 464, 1]) torch.float32 1.0 -1.0
torch.Size([8, 464, 1]) torch.float32 1.0 -1.0
torch.Size([8, 464, 1]) torch.int6