In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Imports

In [None]:
# Notebook bootstrap: widen the notebook page and make the project directory
# importable for the local `models` / `utils` packages imported below.
# `display` and `HTML` are taken from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

display(HTML("<style>.container { width:90% !important; }</style>"))

import os, sys, inspect
from pathlib import Path

# Inside a notebook kernel the current frame's "file" is a synthetic cell name
# (possibly located in a temp directory), so it is not a reliable project root.
# Use the script location when run as a file, else the kernel's working dir.
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:  # notebooks do not define __file__
    current_dir = os.getcwd()
print(current_dir)
sys.path.insert(0, current_dir)

import argparse
import torch

# Keep the installed PyTorch version string around for reference.
torch_version = torch.__version__
# Enable autograd anomaly detection globally (affects every backward pass run
# after this point).
# NOTE(review): PyTorch documents this mode as a debugging aid that slows
# execution — consider turning it off for long training runs.
torch.autograd.set_detect_anomaly(True)

import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tensorboardX import SummaryWriter

from models.model_RCNNOnly_combine_indeptPointnet_maskrcnnPose_discount import (
    RCNNOnly_combine,
)
from dataset_coco_pickle_eccv import my_collate, COCO2017ECCV
from utils.data_utils import make_data_loader
from utils.utils_misc import colored
from maskrcnn_rui.data.transforms import (
    build_transforms_maskrcnn,
    build_transforms_yannick,
)
import torch.distributed as dist
import numpy as np
from utils.model_utils import get_bins_combine
import utils.model_utils as model_utils
from utils.train_utils import f_pixels_to_mm
import utils.geo_utils as geo_utils
from tqdm import tqdm
from utils.logger import setup_logger, printer

from maskrcnn_rui.config import cfg
from maskrcnn_rui.utils.comm import get_rank
from utils.checkpointer import DetectronCheckpointer
import random
import numpy as np
from utils.model_utils import get_bins_combine
import utils.model_utils as model_utils

from utils.utils_misc import batch_dict_to_list_of_dicts
from utils.train_utils import f_pixels_to_mm
import utils.geo_utils as geo_utils

import skimage.io as io
from utils.utils_coco import fpix_to_fmm
from utils.vis_utils import blender_render, show_cam_bbox, show_box_kps
import string
from tqdm import tqdm
from utils.eval_save_utils_combine_RCNNONly import check_eval_COCO, check_save
from utils.utils_misc import green


# Seed Python, NumPy and PyTorch RNGs with one fixed value for repeatable runs.
seed = 140421
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

# Command-line interface of the scale-estimation training / evaluation run.
# Flags are grouped as: training schedule, model variants, checkpoint restore,
# device, debugging, Mask R-CNN modules / losses / architectures / weights, and
# finally the yacs-style config file plus free-form config overrides (`opts`).
# Help texts that quote a default use argparse's %(default)s placeholder so the
# documented value always matches the actual default.
parser = argparse.ArgumentParser(description="Rui's Scale Estimation Network Training")
# Training
parser.add_argument(
    "--task_name",
    type=str,
    default="tmp",
    help="name of this run; used for the summary and checkpoint sub-folders",
)
parser.add_argument(
    "--workers", type=int, help="number of data loading workers", default=8
)
parser.add_argument(
    "--save_every_iter",
    type=int,
    default=0,
    help="set to 0 to save ONLY at the end of each epoch",
)
parser.add_argument("--summary_every_iter", type=int, default=100, help="")
parser.add_argument(
    "--nepoch", type=int, default=3, help="number of epochs to train for"
)
parser.add_argument(
    "--beta1",
    type=float,
    default=0.9,
    help="beta1 for adam (default: %(default)s)",
)
parser.add_argument(
    "--not_val", action="store_true", help="Do not validate during training"
)
parser.add_argument("--not_vis", action="store_true", help="")
parser.add_argument("--not_vis_SUN360", action="store_true", help="")
parser.add_argument(
    "--save_every_epoch", type=int, default=1, help="save checkpoint every ? epoch"
)
parser.add_argument("--vis_every_epoch", type=int, default=5, help="vis every ? epoch")
# Model
parser.add_argument(
    "--accu_model",
    action="store_true",
    help="Use accurate model with theta instead of Derek's approx.",
)
parser.add_argument("--argmax_val", action="store_true", help="")
parser.add_argument(
    "--direct_camH",
    action="store_true",
    help="directly predict one number for camera height ONLY, instead of predicting a distribution",
)
parser.add_argument(
    "--direct_v0",
    action="store_true",
    help="directly predict one number for v0 ONLY, instead of predicting a distribution",
)
parser.add_argument(
    "--direct_fmm",
    action="store_true",
    help="directly predict one number for fmm ONLY, instead of predicting a distribution",
)

# Pre-training
parser.add_argument(
    "--resume",
    type=str,
    help="resume training; can be full path (e.g. tmp/checkpoint0.pth.tar) or taskname (e.g. tmp)",
    default="NoCkpt",
)
parser.add_argument(
    "--feature_only",
    action="store_true",
    help="restore only features (remove all classifiers) from checkpoint",
)
parser.add_argument(
    "--reset_scheduler", action="store_true", help=""
)  # NOT working yet
parser.add_argument("--reset_lr", action="store_true", help="")  # NOT working yet

# Device
parser.add_argument("--cpu", action="store_true", help="Force training on CPU")
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument("--master_port", type=str, default="8914")

# DEBUG
parser.add_argument("--debug", action="store_true", help="Debug eval")
parser.add_argument("--debug_memory", action="store_true", help="Debug eval")

# Mask R-CNN
## Modules
# When this flag is absent the "classifier_heads" module is not built.
parser.add_argument(
    "--train_cameraCls",
    action="store_true",
    help="Build and train the camera classifier heads",
)
parser.add_argument("--train_roi_h", action="store_true", help="")
parser.add_argument(
    "--est_bbox",
    action="store_true",
    help="Enable estimating bboxes instead of using GT bboxes",
)
parser.add_argument(
    "--est_kps", action="store_true", help="Enable estimating keypoints"
)
parser.add_argument("--if_discount", action="store_true", help="")
parser.add_argument("--discount_from", type=str, default="pred")  # ('GT', 'pred')

## Losses
parser.add_argument(
    "--loss_last_layer", action="store_true", help="Using loss of last layer only"
)
parser.add_argument(
    "--loss_person_all_layers",
    action="store_true",
    help="Using person loss of all layers (instead of the last layer only)",
)
parser.add_argument(
    "--not_rcnn", action="store_true", help="Disable Mask R-CNN person height bbox head"
)
parser.add_argument("--no_kps_loss", action="store_true", help="")

## Archs
parser.add_argument("--pointnet_camH", action="store_true", help="")
parser.add_argument("--pointnet_camH_refine", action="store_true", help="")
parser.add_argument("--pointnet_personH_refine", action="store_true", help="")
parser.add_argument(
    "--pointnet_roi_feat_input", action="store_true", help=""
)  # NOT working yet
parser.add_argument(
    "--pointnet_roi_feat_input_person3", action="store_true", help=""
)  # NOT working yet
parser.add_argument(
    "--pointnet_fmm_refine", action="store_true", help=""
)  # NOT working yet
parser.add_argument(
    "--pointnet_v0_refine", action="store_true", help=""
)  # NOT working yet
parser.add_argument(
    "--not_pointnet_detach_input", action="store_true", help=""
)  # NOT working yet
parser.add_argument("--num_layers", type=int, default=3)
parser.add_argument("--fit_derek", action="store_true", help="")
## weights
parser.add_argument(
    "--weight_SUN360",
    type=float,
    default=10.0,
    help="weight for Yannick's losses (default: %(default)s)",
)
parser.add_argument(
    "--weight_kps",
    type=float,
    default=1e-3,
    help="weight for the keypoint loss (default: %(default)s)",
)

## debug
parser.add_argument("--zero_pitch", action="store_true", help="")  # NOT working yet

parser.add_argument(
    "--config-file",
    default="",
    metavar="FILE",
    help="path to config file",
    type=str,
)
# Everything left on the command line is collected here as KEY VALUE pairs.
parser.add_argument(
    "opts",
    help="Modify config options using the command-line",
    default=None,
    nargs=argparse.REMAINDER,
)

# Parse a fixed argument list (this is a notebook run, so sys.argv is not
# used), then merge the YAML config and derive the runtime device settings.
notebook_args = [
    "--task_name", "tmp_eval",
    "--num_layers", "3",
    "--train_cameraCls",
    "--train_roi_h",
    "--pointnet_camH",
    "--pointnet_camH_refine",
    "--pointnet_personH_refine",
    "--accu_model",
    "--est_kps",
    "--est_bbox",
    "--loss_person_all_layers",
    "--config-file", "config/coco_config_small_synBN1108_kps.yaml",
    "--weight_SUN360", "10.",
]
opt = parser.parse_args(notebook_args)

print(opt)
opt.debug = True  # debug mode is always forced on in this notebook
opt.checkpoints_folder = "checkpoint"

# Use the GPU unless --cpu was requested or no CUDA device is available, so
# that `cfg.MODEL.DEVICE` and `opt.device` are chosen from one decision.
device = "cuda" if (torch.cuda.is_available() and not opt.cpu) else "cpu"

config_file = opt.config_file
cfg.merge_from_file(config_file)
# manual override some options
cfg.merge_from_list(["MODEL.DEVICE", device])
# Free-form overrides are applied last, so they win over the lines above.
cfg.merge_from_list(opt.opts)
cfg.freeze()

opt.cfg = cfg

# sys.path.insert(0, cfg.MODEL.POINTNET.PATH)
opt.rank = opt.local_rank

# Single-process run: distributed mode stays off.
num_gpus = 1
opt.distributed = num_gpus > 1
opt.device = device


# Logger

In [None]:
# === SUMMARY WRITERS
# The TensorBoard writer and the text logger share one per-task directory.
summary_path = f"./summary/{opt.task_name}"
writer = SummaryWriter(summary_path)

# === LOGGING
logger = setup_logger(
    "logger:train",
    summary_path,
    get_rank(),
    filename="logger_maskrcn-style.txt",
)
# Dump the parsed options and the merged config, each under its own banner.
for banner, payload in (("==[config]== opt", opt), ("==[config]== cfg", cfg)):
    logger.info(colored(banner, "white", "on_blue"))
    logger.info(payload)

loaded_banner = "==[config]== Loaded configuration file {}".format(opt.config_file)
logger.info(colored(loaded_banner, "white", "on_blue"))
# Echo the raw config file into the log as well.
with open(opt.config_file, "r") as cf:
    config_str = "\n" + cf.read()
logger.info(config_str)
# NOTE(review): this rebinds the imported name `printer` to the object it
# returns, so running this cell a second time calls that object instead.
printer = printer(get_rank(), debug=opt.debug)

# Model

In [None]:
# === MODEL & TRAINING
# A sub-module is left out of the model when the flag that enables it is off.
modules_not_build = [
    module_name
    for is_enabled, module_name in (
        (opt.train_cameraCls, "classifier_heads"),
        (opt.train_roi_h, "roi_h_heads"),
        (opt.est_bbox or opt.est_kps, "roi_bbox_heads"),
    )
    if not is_enabled
]
sys.path.insert(0, "models/pointnet")
print(modules_not_build)
opt.debug = True
model = RCNNOnly_combine(
    opt,
    logger,
    printer,
    num_layers=opt.num_layers,
    modules_not_build=modules_not_build,
)

model.to(device)
model.turn_on_all_params()

# List every parameter with its shape and trainable flag, then the total count.
for name, param in model.named_parameters():
    printer.print(name, param.shape, param.requires_grad)
num_named_params = len(list(model.named_parameters()))
printer.print("ALL %d params" % num_named_params)

# Adam over all model parameters; the LR is halved when the monitored value
# stops decreasing ("min" mode).
adam_betas = (opt.beta1, 0.999)
optimizer = optim.Adam(
    model.parameters(),
    lr=cfg.SOLVER.BASE_LR,
    betas=adam_betas,
    eps=1e-5,
)
scheduler = ReduceLROnPlateau(
    optimizer,
    "min",
    factor=0.5,
    patience=100,
    cooldown=50,
)

In [None]:
# === CHECKPOINT
# Checkpoints of this task live in <checkpoints_folder>/<task_name>; only the
# rank-0 process is flagged to write to disk.
opt.checkpoints_path_task = os.path.join(opt.checkpoints_folder, opt.task_name)
save_to_disk = get_rank() == 0
checkpointer = DetectronCheckpointer(
    opt,
    model,
    optimizer,
    scheduler,
    opt.checkpoints_folder,
    opt.checkpoints_path_task,
    save_to_disk,
    logger=logger,
    if_print=False,
)
# Counters stay at zero unless a restored checkpoint provides them.
tid_start, epoch_start = 0, 0
resume_requested = opt.resume != 'NoCkpt'
if resume_requested:
    checkpoint_restored, _, _ = checkpointer.load(task_name=opt.resume)
    tid_start = (
        checkpoint_restored['iteration']
        if 'iteration' in checkpoint_restored
        else tid_start
    )
    epoch_start = (
        checkpoint_restored['epoch']
        if 'epoch' in checkpoint_restored
        else epoch_start
    )
    print(checkpoint_restored.keys())
    restore_msg = 'Restoring from epoch %d - iter %d' % (epoch_start, tid_start)
    logger.info(colored(restore_msg, 'white', 'on_blue'))
model.print_net()


# Dataset

In [None]:
# === DATASET
# Transform pipelines: each builder is called for training (True) and then
# for evaluation (False).
train_trnfs_maskrcnn, eval_trnfs_maskrcnn = (
    build_transforms_maskrcnn(cfg, is_train_flag) for is_train_flag in (True, False)
)
train_trnfs_yannick, eval_trnfs_yannick = (
    build_transforms_yannick(cfg, is_train_flag) for is_train_flag in (True, False)
)

# Both dataset splits share everything except their transforms and split name.
shared_dataset_kwargs = {"shuffle": False, "logger": logger, "opt": opt}
ds_train_coco_vis = COCO2017ECCV(
    transforms_yannick=train_trnfs_yannick,
    transforms_maskrcnn=train_trnfs_maskrcnn,
    split="train",
    **shared_dataset_kwargs,
)
ds_eval_coco_vis = COCO2017ECCV(
    transforms_yannick=eval_trnfs_yannick,
    transforms_maskrcnn=eval_trnfs_maskrcnn,
    split="val",
    **shared_dataset_kwargs,
)

# Non-distributed loaders using the project's collate function.
shared_loader_kwargs = {
    "is_distributed": False,
    "logger": logger,
    "collate_fn": my_collate,
}
training_loader_coco_vis = make_data_loader(
    cfg,
    ds_train_coco_vis,
    is_train=True,
    start_iter=0,
    # BN does not make sense when model.train() and batchsize==1!
    batch_size_override=2,
    **shared_loader_kwargs,
)
eval_loader_coco_vis = make_data_loader(
    cfg,
    ds_eval_coco_vis,
    is_train=False,
    batch_size_override=-1,
    **shared_loader_kwargs,
)

# Train Model

In [None]:
# Output folders for the training run:
# train_results/<resume name>-predH/{png,npy,pickle,results}
results_path = "train_results"
Path(results_path).mkdir(exist_ok=True)
task_name_appendix = "-predH"
task_name = opt.resume + task_name_appendix

write_folder = os.path.join(results_path, task_name)
results_path_png = os.path.join(write_folder, "png")
results_path_results = os.path.join(write_folder, "results")
# The empty entry creates the task folder itself.
for subfolder in ("", "png", "npy", "pickle", "results"):
    os.makedirs(os.path.join(write_folder, subfolder), exist_ok=True)

# Mode flags for the training cells.
is_training, if_vis, if_debug = True, False, False
prepostfix = "trainSet-"

In [None]:
# Training cell: for each epoch, iterate over the COCO training loader, take
# one optimizer step per batch on a weighted sum of the losses, log every
# `opt.summary_every_iter` iterations, and evaluate + checkpoint every
# `opt.save_every_iter` iterations (if non-zero) and on the last batch of the
# epoch.
from train_batch_combine_RCNNOnly_v5_pose_multiCat import train_batch_combine
opt.zero_pitch = False
if_vis = False
if_blender = False
select_show = 0

## START TRAINING
best_loss = float("inf")
bins = get_bins_combine(device)
tid = 0

epoch = 0
epochs_evalued = []
epochs_saved = []
eval_loss = 0
cont = 0

model.train()

cpu_device = torch.device("cpu")
loss_func = torch.nn.L1Loss()
if opt.distributed:
    rank = dist.get_rank()
else:
    rank = 0

eval_loss_vt_list = []
eval_loss_person_list = []
vt_loss_allBoxes_dict = {}
vt_loss_allBoxes_dict_list = []

im_filename_list = []

test_list = []

vt_loss_all = []
if_print = False
if_debug = False
cfg = opt.cfg

num_plots = 0
pitch_abs_list = []

# Loss weights (loop-invariant): a1 -> loss_vt, a2 -> loss_person,
# a4 -> loss_bbox_cls, a5 -> loss_kp.
# NOTE: I need to add the loss from the Calibration dataset here for a3
a1, a2, a3, a4, a5 = 1.0, 0.05, 1.0, 10.0, 10.0
# Index of the final batch of an epoch; it triggers the end-of-epoch eval/save.
last_tid = len(training_loader_coco_vis) - 1

# NOTE(review): `tid_start` / `epoch_start` restored from a checkpoint are not
# used here, so a resumed run counts epochs and iterations from 0 again.
epochs = range(0, opt.nepoch)
for epoch in epochs:
    # Wrap the loader itself (not the enumerate object) so tqdm knows the total.
    for i, (
        _,
        inputCOCO_Image_maskrcnnTransform_list,
        W_batch_array,
        H_batch_array,
        yc_batch,
        bboxes_batch_array,
        bboxes_length_batch_array,
        v0_batch,
        f_pixels_yannick_batch,
        im_filename,
        im_file,
        target_maskrcnnTransform_list,
        labels_list,
    ) in enumerate(tqdm(training_loader_coco_vis)):
        tid = i
        is_better = False
        input_dict = {
            "inputCOCO_Image_maskrcnnTransform_list": inputCOCO_Image_maskrcnnTransform_list,
            "W_batch_array": W_batch_array,
            "H_batch_array": H_batch_array,
            "yc_batch": yc_batch,
            "bboxes_batch_array": bboxes_batch_array,
            "bboxes_length_batch_array": bboxes_length_batch_array,
            "v0_batch": v0_batch,
            "f_pixels_yannick_batch": f_pixels_yannick_batch,
            "im_filename": im_filename,
            "im_file": im_file,
            "bins": bins,
            "target_maskrcnnTransform_list": target_maskrcnnTransform_list,
            "labels_list": labels_list,
        }
        first_out, second_out = train_batch_combine(
            input_dict,
            model,
            device,
            opt,
            is_training=True,
            tid=i,
            loss_func=loss_func,
            rank=rank,
            if_SUN360=False,
            if_vis=False,
        )
        # Key sets recorded for the two returned dicts:
        # dict_keys(['loss_kp', 'loss_bbox_cls', 'loss_bbox_reg', 'loss_vt', 'loss_person'])
        # dict_keys(
            # ['yc_est_batch', 'vt_loss_allBoxes_dict', 'loss_vt_list', 'loss_vt_layers_dict',
            # 'all_person_hs', 'all_person_hs_layers', 'loss_all_person_h_list'])
        # The loss terms were previously read from one dict for the optimizer
        # step and from the other for logging; going by the key sets above only
        # one of them has 'loss_vt', so one lookup would raise KeyError.
        # NOTE(review): the helper's return order is not visible from this
        # cell, so the dict holding the scalar losses is picked by its keys —
        # confirm against train_batch_combine and simplify.
        if "loss_vt" in first_out:
            loss_dict, return_dict = first_out, second_out
        else:
            loss_dict, return_dict = second_out, first_out

        # Combine the losses into a single scalar
        total_loss = (
            a1 * loss_dict["loss_vt"]
            + a2 * loss_dict["loss_person"]
            + a4 * loss_dict["loss_bbox_cls"]
            + a5 * loss_dict["loss_kp"]
        )
        # Backpropagation
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()
        if tid % opt.summary_every_iter == 0:
            logger.info(
                f"Epoch {epoch} Iter {tid}: Loss VT = {loss_dict['loss_vt'].item():.4f} " +
                f"Loss Person = {loss_dict['loss_person'].item():.4f}"
            )
        if opt.save_every_iter != 0 and tid % opt.save_every_iter == 0 and tid > 0:
            check_save(
                rank=rank,
                tid=tid,
                epoch_save=epoch,
                epoch_total=epoch,
                opt=opt,
                checkpointer=checkpointer,
                epochs_saved=epochs_saved,
                checkpoints_folder=opt.checkpoints_folder,
                logger=logger,
                is_better=is_better,
            )
        # Evaluate (and save again with the eval verdict) on the periodic save
        # iterations and on the last batch of the epoch.
        if tid != 0 and (
            (opt.save_every_iter != 0 and tid % opt.save_every_iter == 0)
            or tid == last_tid
        ):
            is_better = check_eval_COCO(
                tid=tid,
                epoch=epoch,
                rank=rank,
                opt=opt,
                model=model,
                eval_loader=eval_loader_coco_vis,
                writer=writer,
                device=device,
                bins=bins,
                logger=logger,
                scheduler=scheduler,
                epochs_evalued=epochs_evalued,
            )
            # Make sure training mode is active again after the evaluation
            # pass, whatever mode the helper left the model in.
            model.train()

            check_save(
                rank=rank,
                tid=tid,
                epoch_save=epoch,
                epoch_total=epoch,
                opt=opt,
                checkpointer=checkpointer,
                epochs_saved=epochs_saved,
                checkpoints_folder=opt.checkpoints_folder,
                logger=logger,
                is_better=is_better,
            )

# Eval-RELEASE

In [None]:
# Output folders for the release evaluation:
# release_results/<resume name>-predH/{png,npy,pickle,results}
results_path = "release_results"

Path(results_path).mkdir(exist_ok=True)
task_name_appendix = "-predH"
task_name = opt.resume + task_name_appendix

print(task_name)
write_folder = os.path.join(results_path, task_name)
results_path_png = os.path.join(write_folder, "png")
results_path_results = os.path.join(write_folder, "results")
# The empty entry creates the task folder itself.
for subfolder in ("", "png", "npy", "pickle", "results"):
    os.makedirs(os.path.join(write_folder, subfolder), exist_ok=True)

# Mode flags for the evaluation cells: visualisation on, training off.
is_training, if_debug = False, False
if_vis, if_blender = True, True
prepostfix = "testSet-|-evalMode"

# Evaluation runs over the COCO validation loader.
test_loader = eval_loader_coco_vis

In [None]:
# Evaluation preamble: reset counters and accumulators, and switch the model
# to eval mode before the evaluation loop.
import base64
opt.zero_pitch = False
select_show = 0
logger = setup_logger(
    "logger:train",
    summary_path,
    get_rank(),
    filename="logger_maskrcn-style.txt",
)
## START EVALUATION
best_loss = float("inf")
bins = get_bins_combine(device)
tid = 0
epoch = 0
eval_loss = 0

model.eval()

cpu_device = torch.device("cpu")
loss_func = torch.nn.L1Loss()
# Rank comes from the process group only in distributed mode.
rank = dist.get_rank() if opt.distributed else 0

# Accumulators available to the evaluation loop.
eval_loss_vt_list, eval_loss_person_list = [], []
vt_loss_allBoxes_dict = {}
vt_loss_allBoxes_dict_list = []
im_filename_list = []
test_list = []
vt_loss_all = []

num_plots = 0
pitch_abs_list = []
with torch.no_grad():
    for i, (
        _,
        inputCOCO_Image_maskrcnnTransform_list,
        W_batch_array,
        H_batch_array,
        yc_batch,
        bboxes_batch_array,
        bboxes_length_batch_array,
        v0_batch,
        f_pixels_yannick_batch,
        im_filename,
        im_file,
        target_maskrcnnTransform_list,
        labels_list,
    ) in tqdm(enumerate(test_loader)):

        if select_show != -1 and i < select_show:
            continue
        tid = i

        input_dict = {
            "inputCOCO_Image_maskrcnnTransform_list": inputCOCO_Image_maskrcnnTransform_list,
            "W_batch_array": W_batch_array,
            "H_batch_array": H_batch_array,
            "yc_batch": yc_batch,
            "bboxes_batch_array": bboxes_batch_array,
            "bboxes_length_batch_array": bboxes_length_batch_array,
            "v0_batch": v0_batch,
            "f_pixels_yannick_batch": f_pixels_yannick_batch,
            "im_filename": im_filename,
            "im_file": im_file,
            "bins": bins,
            "target_maskrcnnTransform_list": target_maskrcnnTransform_list,
            "labels_list": labels_list,
        }
        if_print = is_training
        cfg = opt.cfg
        bins = input_dict["bins"]

        # ========= Rui's inputs
        inputCOCO_Image_maskrcnnTransform_list = input_dict[
            "inputCOCO_Image_maskrcnnTransform_list"
        ]
        (
            bboxes_batch,
            v0_batch_offline,
            f_pixels_yannick_batch_offline,
            W_batch,
            H_batch,
            yc_batch_offline,
        ) = (
            torch.from_numpy(input_dict["bboxes_batch_array"]).float().to(device),
            input_dict["v0_batch"].to(device),
            input_dict["f_pixels_yannick_batch"].to(device),
            torch.from_numpy(input_dict["W_batch_array"]).to(device),
            torch.from_numpy(input_dict["H_batch_array"]).to(device),
            input_dict["yc_batch"].to(device),
        )

        if not opt.not_rcnn:
            list_of_bbox_list_cpu = model_utils.bboxArray_to_bboxList(
                bboxes_batch,
                input_dict["bboxes_length_batch_array"],
                input_dict["W_batch_array"],
                input_dict["H_batch_array"],
            )
        else:
            list_of_bbox_list_cpu = []

        if if_vis:
            input_dict_show = {'H': input_dict['H_batch_array'], 'W': input_dict['W_batch_array']}
            if 'testSet' not in prepostfix:
                input_dict_show.update({'v0_cocoPredict': input_dict['v0_batch'].numpy()})
            input_dict_show['bbox_gt'] = []
            input_dict_show['bbox_est'] = []
            input_dict_show['bbox_fit'] = []
            input_dict_show['bbox_h'] = []
            input_dict_show['bbox_geo'] = []
            input_dict_show['bbox_loss'] = []


        list_of_oneLargeBbox_list_cpu = model_utils.oneLargeBboxList(
            input_dict["W_batch_array"], input_dict["H_batch_array"]
        )  # in original image size
        list_of_oneLargeBbox_list = [
            bbox_list_array.to(device)
            for bbox_list_array in list_of_oneLargeBbox_list_cpu
        ]

        input_dict_misc = {
            "bins": bins,
            "is_training": is_training,
            "H_batch": H_batch,
            "W_batch": W_batch,
            "bboxes_batch": bboxes_batch,
            "loss_func": loss_func,
            "cpu_device": cpu_device,
            "device": device,
            "tid": tid,
            "rank": rank,
            "data": "coco",
            "if_vis": if_vis,
        }
        output_RCNN = model(
            input_dict_misc=input_dict_misc,
            input_dict=input_dict,
            image_batch_list=inputCOCO_Image_maskrcnnTransform_list,
            list_of_bbox_list_cpu=list_of_bbox_list_cpu,
            list_of_oneLargeBbox_list=list_of_oneLargeBbox_list,
        )
        pitch_abs_degree = abs(output_RCNN["pitch_batch_est"][0].item()) / np.pi * 180.0
        pitch_abs_list.append(pitch_abs_degree)
        if "zero_pitch" in opt and opt.zero_pitch:
            output_RCNN["pitch_batch_est"] *= 0.0
            output_RCNN["output_pitch"] *= 0.0
            output_RCNN["v0_batch_from_pitch_vfov"] = (
                output_RCNN["v0_batch_from_pitch_vfov"] * 0.0 + H_batch[0] / 2.0
            )
            output_RCNN["v0_batch_est"] = output_RCNN["v0_batch_from_pitch_vfov"]

        output_horizon = output_RCNN["output_horizon"]
        output_pitch = output_RCNN["output_pitch"]
        output_vfov = output_RCNN["output_vfov"]
        output_yc_batch = output_RCNN["output_yc_batch"]
        f_pixels_yannick_batch_ori = f_pixels_yannick_batch_offline.clone()
        v0_batch_ori = v0_batch_offline.clone()

        v0_batch_est = output_RCNN["v0_batch_est"]
        v0_batch_from_pitch_vfov = output_RCNN["v0_batch_from_pitch_vfov"]

        vfov_estim = output_RCNN["vfov_estim"]
        f_pixels_yannick_batch_est = output_RCNN["f_pixels_batch_est"]

        pitch_batch_est = output_RCNN["pitch_batch_est"]
        pitch_estim_yannick = output_RCNN["pitch_estim_yannick"]

        yc_est_batch = output_RCNN["yc_est_batch"]

        person_h_list = output_RCNN["person_h_list"]
        all_person_hs = output_RCNN["all_person_hs"]

        reduce_method = output_RCNN["reduce_method"]

        if tid % opt.summary_every_iter == 0 and if_print:
            f_mm_array_est = f_pixels_to_mm(f_pixels_yannick_batch_est, input_dict)

        ## Losses for bbox fitting
        vt_loss_sample_batch_list = []
        vt_loss_allBoxes_dict_cpu = {}
        vt_error_fit_allBoxes_dict_cpu = {}
        camH_fit_batch = []

        for idx, bboxes_length in enumerate(input_dict["bboxes_length_batch_array"]):
            bboxes = bboxes_batch[idx][:bboxes_length]  # [N, 4]
            W = W_batch[idx]
            H = H_batch[idx]
            vc = H / 2.0
            v0_est = v0_batch_est[idx]
            v0_ori = v0_batch_ori[idx]  # [top H, bottom 0]
            pitch_est = pitch_batch_est[idx]
            f_pixels_yannick_est = f_pixels_yannick_batch_est[idx]
            inv_f2 = 1.0 / (f_pixels_yannick_est * f_pixels_yannick_est)
            yc_est = yc_est_batch[idx]

            H_np = H_batch[idx].cpu().numpy()
            W_np = W_batch[idx].cpu().numpy()

            if opt.not_rcnn:
                h_human_s = (
                    torch.from_numpy(
                        np.asarray([1.75] * bboxes.shape[0], dtype=np.float32)
                    )
                    .float()
                    .to(device)
                )
            else:
                h_human_s = (
                    person_h_list[idx] * output_RCNN["straighten_ratios_list"][idx]
                )

            vt_loss_sample_list = []
            vt_error_sample_fit_list = []

            camH_fit_list = []
            for bbox_idx, bbox in enumerate(bboxes):
                y_person_fit = 1.75
                camH_fit_bbox = geo_utils.fit_camH(
                    bbox.cpu(),
                    H.cpu(),
                    v0_est.cpu(),
                    vc.cpu(),
                    f_pixels_yannick_est.cpu(),
                    y_person_fit,
                )
                camH_fit_list.append(camH_fit_bbox.detach().numpy())
            camH_fit = np.median(np.array(camH_fit_list))
            camH_fit_batch.append(camH_fit)

            bbox_gt_sample = []
            bbox_est_sample = []
            bbox_fit_sample = []
            bbox_h_sample = []
            bbox_geo_sample = []
            bbox_loss_sample = []
            for bbox_idx, (bbox, y_person) in enumerate(zip(bboxes, h_human_s)):
                vb = H - (bbox[1] + bbox[3])  # [top H bottom 0]
                vt_gt = H - bbox[1]  # [top H bottom 0]

                ## Fit y_person
                geo_model_input_dict = {
                    "yc_est": yc_est,
                    "vb": vb,
                    "y_person": y_person,
                    "v0": v0_est,
                    "vc": vc,
                    "f_pixels_yannick": f_pixels_yannick_est,
                    "pitch_est": pitch_est,
                }
                negative_z = False
                if opt.accu_model:
                    vt_camEst, z, negative_z = model_utils.accu_model_helanyi(
                        geo_model_input_dict, if_debug=False
                    )  # [top H bottom 0]
                    geo_model_input_dict_rui = {
                        "yc_est": yc_est,
                        "vb": vb,
                        "y_person": y_person * torch.cos(pitch_est),
                        "v0": v0_est,
                        "vc": vc,
                        "f_pixels_yannick": f_pixels_yannick_est,
                        "pitch_est": pitch_est,
                    }
                    vt_camEst_rui, z_rui, negative_z_rui = model_utils.accu_model(
                        geo_model_input_dict_rui, if_debug=False
                    )  # [top H bottom 0]
                else:
                    vt_camEst = model_utils.approx_model(geo_model_input_dict)

                vt_loss = loss_func(vt_gt, vt_camEst.reshape([])) / bbox[3]
                vt_loss = torch.clamp(vt_loss, 0.0, opt.cfg.MODEL.LOSS.VT_LOSS_CLAMP)
                vt_loss = torch.where(
                    torch.isnan(vt_loss), torch.zeros_like(vt_loss), vt_loss
                )
                vt_loss_sample_list.append(vt_loss)
                vt_loss_allBoxes_dict_cpu.update(
                    {
                        "bbox_vt_loss_tid%04d_rank%d_%02d-%02d"
                        % (tid, rank, idx, bbox_idx): vt_loss.to(cpu_device)
                    }
                )

                # Fitting 现场: getting vt
                y_person_fit = 1.75
                vt_camFit = geo_utils.fit_vt(
                    camH_fit,
                    vb,
                    v0_est,
                    vc,
                    y_person_fit,
                    1.0 / (f_pixels_yannick_est * f_pixels_yannick_est),
                )
                vt_error_fit = loss_func(vt_gt, vt_camFit) / bbox[3]

                vt_error_fit_allBoxes_dict_cpu.update(
                    {
                        "bbox_vt_error_fit_tid%04d_rank%d_%02d-%02d"
                        % (tid, rank, idx, bbox_idx): vt_error_fit.to(cpu_device)
                    }
                )

                vt_error_sample_fit_list.append(
                    vt_error_fit.detach().cpu().numpy().item()
                )

                if if_vis:
                    bbox_np = bbox.cpu().numpy()
                    vt_camEst_np = vt_camEst.detach().cpu().numpy()
                    vt_camFit_np = vt_camFit.detach().cpu().numpy()
                    bbox_gt_sample.append(
                        [bbox_np[0], bbox_np[1], bbox_np[2], bbox_np[3]]
                    )  # [x, y (top), w, h]
                    bbox_est_sample.append(
                        [
                            bbox_np[0],
                            H_np - vt_camEst_np,
                            bbox_np[2],
                            bbox_np[1] + bbox_np[3] - (H_np - vt_camEst_np),
                        ]
                    )
                    bbox_h_sample.append(y_person.cpu().detach().numpy())
                    bbox_geo_sample.append(geo_model_input_dict)
                    bbox_loss_sample.append(vt_loss.item())

            if if_vis:
                input_dict_show["bbox_gt"].append(bbox_gt_sample)
                input_dict_show["bbox_est"].append(bbox_est_sample)
                input_dict_show["bbox_fit"].append(bbox_fit_sample)
                input_dict_show["bbox_h"].append(bbox_h_sample)
                input_dict_show["bbox_geo"].append(bbox_geo_sample)
                input_dict_show["bbox_loss"].append(bbox_loss_sample)

            vt_loss_sample = torch.mean(torch.stack(vt_loss_sample_list))
            vt_loss_sample_batch_list.append(vt_loss_sample)

        # ---- Aggregate this batch's losses and collect the results ----
        # One loss entry per sample is stacked into a single tensor; its mean is
        # the scalar loss of the stage computed in this loop body.
        vt_loss_batch = torch.stack(vt_loss_sample_batch_list)
        loss_vt = vt_loss_batch.mean()

        # Per-batch results, filled key by key (insertion order is preserved).
        return_dict = {}
        return_dict["vt_loss_batch"] = vt_loss_batch
        return_dict["vt_loss_allBoxes_dict"] = vt_loss_allBoxes_dict_cpu
        return_dict["vt_error_fit_allBoxes_dict"] = vt_error_fit_allBoxes_dict_cpu
        return_dict["yc_est_batch"] = yc_est_batch
        return_dict["yc_batch_offline"] = yc_batch_offline
        return_dict["yc_fit_batch"] = np.array(camH_fit_batch)
        vt_loss_allBoxes_dict_list.append(vt_loss_allBoxes_dict_cpu)
        # The focal-length column is only attached on summary iterations.
        if tid % opt.summary_every_iter == 0 and if_print:
            return_dict["f_mm_batch"] = f_mm_array_est.reshape(-1, 1)

        # Losses already reported by the network (if any) come first; the loss
        # computed above is appended last. When the list comes from
        # `output_RCNN` it is extended in place, so `output_RCNN` sees the
        # extra entry as well.
        if "loss_vt_list" not in output_RCNN:
            loss_vt_list = []
        else:
            loss_vt_list = output_RCNN["loss_vt_list"]
            assert len(loss_vt_list) != 0
        loss_vt_list.append(loss_vt)
        return_dict["loss_vt_list"] = loss_vt_list

        # mean of layers; for optimization and scheduler
        loss_dict = {}
        loss_dict["loss_vt"] = sum(loss_vt_list) / len(loss_vt_list)

        if opt.pointnet_camH_refine:
            # Keys use negative indices counted from the end of the list, so the
            # last entry is stored under "loss_vt_layer_-1".
            loss_vt_layers_dict = {}
            for loss_idx, loss in enumerate(loss_vt_list):
                loss_vt_layers_dict[
                    "loss_vt_layer_%d" % (loss_idx - len(loss_vt_list))
                ] = loss

            return_dict["loss_vt_layers_dict"] = loss_vt_layers_dict

        if not opt.not_rcnn:
            return_dict["all_person_hs"] = all_person_hs
            loss_all_person_h_list = output_RCNN["loss_all_person_h_list"]
            return_dict["loss_all_person_h_list"] = loss_all_person_h_list
            # mean of layers; for optimization and scheduler
            loss_dict["loss_person"] = sum(loss_all_person_h_list) / len(
                loss_all_person_h_list
            )

        # ========== Some vis
        # Assemble `input_dict_show`, the payload handed to the plotting helpers
        # further down. The image width/height arrays are copied unconditionally;
        # everything else is only gathered when visualisation is requested.
        input_dict_show["W_batch_array"] = input_dict["W_batch_array"]
        input_dict_show["H_batch_array"] = input_dict["H_batch_array"]
        if if_vis:
            if opt.est_kps:
                # Keypoint mode: forward the network predictions together with
                # the targets taken from the input batch.
                input_dict_show["predictions"] = output_RCNN["predictions"]
                input_dict_show["target_maskrcnnTransform_list"] = input_dict[
                    "target_maskrcnnTransform_list"
                ]
            #             input_dict_show['v0_batch_predict'] = v0_batch_predict.detach().cpu().numpy()  # (H = top of the image, 0 = bottom of the image)
            # Tensors are detached and moved to CPU NumPy arrays before being
            # stored for plotting.
            input_dict_show["v0_batch_from_pitch_vfov"] = (
                v0_batch_from_pitch_vfov.detach().cpu().numpy()
            )
            input_dict_show["v0_batch_est"] = v0_batch_est.detach().cpu().numpy()
            if "v0_batch_est_0" in output_RCNN:
                input_dict_show["v0_batch_est_0"] = (
                    output_RCNN["v0_batch_est_0"].detach().cpu().numpy()
                )
            # Focal lengths: keep the raw values and a per-element conversion via
            # `fpix_to_fmm`, using `H_np` / `W_np` defined earlier in the loop.
            # NOTE(review): judging by the names this converts pixels to
            # millimetres -- confirm against utils.utils_coco.
            f_pixels_yannick_single_est = (
                f_pixels_yannick_batch_est.detach().cpu().numpy()
            )
            f_pixels_yannick_single_est_mm = [
                fpix_to_fmm(f_pixels_yannick_single_est_0, H_np, W_np)
                for f_pixels_yannick_single_est_0 in f_pixels_yannick_single_est
            ]
            f_pixels_yannick_single_ori = (
                f_pixels_yannick_batch_ori.detach().cpu().numpy()
            )
            f_pixels_yannick_single_ori_mm = [
                fpix_to_fmm(f_pixels_yannick_single_ori_0, H_np, W_np)
                for f_pixels_yannick_single_ori_0 in f_pixels_yannick_single_ori
            ]
            # "yc_fit" is filled from the offline values, "yc_est" from the
            # estimate produced in this pass.
            input_dict_show.update(
                {
                    "yc_fit": yc_batch_offline.detach().cpu().numpy(),
                    "yc_est": yc_est_batch.detach().cpu().numpy(),
                }
            )

            # The per-layer lists below are only attached when the network
            # reported more than one entry.
            if len(output_RCNN["yc_est_batch_np_list"]) > 1:  # more than one layers
                input_dict_show.update(
                    {"yc_est_list": output_RCNN["yc_est_batch_np_list"]}
                )
            if len(output_RCNN["person_hs_est_np_list"]) > 1:
                input_dict_show.update(
                    {"person_hs_est_list": output_RCNN["person_hs_est_np_list"]}
                )
            # This key may be missing from `output_RCNN`, hence the extra
            # membership test.
            if (
                "vt_camEst_N_delta_np_list" in output_RCNN
                and len(output_RCNN["vt_camEst_N_delta_np_list"]) > 1
            ):
                input_dict_show.update(
                    {
                        "vt_camEst_N_delta_est_list": output_RCNN[
                            "vt_camEst_N_delta_np_list"
                        ]
                    }
                )
            if (
                len(output_RCNN["f_pixels_est_batch_np_list"]) > 1
            ):  # more than one layers
                # Each list entry is passed through `fpix_to_fmm` as a whole.
                input_dict_show.update(
                    {
                        "f_pixels_est_mm_list": [
                            fpix_to_fmm(f_pixels_est_0, H_np, W_np)
                            for f_pixels_est_0 in output_RCNN[
                                "f_pixels_est_batch_np_list"
                            ]
                        ]
                    }
                )
            if len(output_RCNN["v0_01_est_batch_np_list"]) > 1:  # more than one layers
                input_dict_show.update(
                    {"v0_est_list": output_RCNN["v0_01_est_batch_np_list"]}
                )

            input_dict_show.update(
                {
                    "f_est_px": f_pixels_yannick_single_est,
                    "f_est_mm": f_pixels_yannick_single_est_mm,
                }
            )
            input_dict_show.update(
                {
                    "f_cocoPredict": f_pixels_yannick_single_ori,
                    "f_cocoPredict_mm": f_pixels_yannick_single_ori_mm,
                }
            )
            # Scaled by 180 / pi, i.e. degrees if `pitch_batch_est` is in radians.
            input_dict_show.update(
                {
                    "pitch_est_angle": pitch_batch_est.detach().cpu().numpy()
                    / np.pi
                    * 180.0
                }
            )

            input_dict_show["im_path"] = list(input_dict["im_file"])
            input_dict_show["im_filename"] = list(input_dict["im_filename"])
            # Batch size, taken from the number of image files in the batch.
            num_samples = len(input_dict["im_file"])
            input_dict_show["output_horizon_COCO"] = (
                output_RCNN["output_horizon"].detach().cpu().numpy()
            )
            # Values shared by the whole batch are repeated `num_samples` times
            # so that they have one entry per sample.
            input_dict_show["horizon_bins"] = [
                bins["horizon_bins_centers_torch"].cpu().numpy()
            ] * num_samples

            if not opt.direct_camH:
                # Camera-height network output and its bin centres are only
                # stored when `opt.direct_camH` is off.
                input_dict_show["output_camH_COCO"] = (
                    output_RCNN["output_yc_batch"].detach().cpu().numpy()
                )
                input_dict_show["camH_bins"] = [
                    bins["yc_bins_centers_torch"].cpu().numpy()
                ] * num_samples

            input_dict_show["tid"] = [tid] * num_samples
            input_dict_show["task_name"] = [opt.task_name] * num_samples
            # Stored as a plain integer, unlike the replicated entries around it.
            input_dict_show["num_samples"] = num_samples
            input_dict_show["reduce_method"] = [reduce_method] * num_samples
            input_dict_show.update(
                {
                    "vfov_est": vfov_estim.detach().cpu().numpy(),
                    "pitch_est_yannick": pitch_estim_yannick.detach().cpu().numpy(),
                }
            )

            # Convert the batch-level dict into a list of dicts and keep only its
            # first element, so just one entry of the batch is visualised below.
            # NOTE(review): presumably the helper yields one dict per sample and
            # "im_path" becomes a single path afterwards -- confirm in
            # utils.utils_misc.batch_dict_to_list_of_dicts.
            if input_dict_show["num_samples"] > 0:
                input_dict_show_list = batch_dict_to_list_of_dicts(input_dict_show)
                input_dict_show = input_dict_show_list[0]

        prefix, postfix = prepostfix.split("|")

        save_path_blender = results_path_png
        save_path_ours = results_path_png
        prefix = "coco_" + prefix
        postfix += "_reproj"

        if if_vis:
            show_cam_bbox(
                io.imread(input_dict_show["im_path"]),
                input_dict_show,
                figzoom=1.5,
                if_show=True,
                if_save=True if save_path_ours else False,
                if_pause=False,
                save_path=save_path_ours,
                save_name=prefix + "%06d" % (tid) + postfix + "_reproj",
            )
            if opt.est_kps:
                show_box_kps(
                    opt,
                    model,
                    io.imread(input_dict_show["im_path"]),
                    input_dict_show,
                    if_show=True,
                    if_pause=False,
                    save_path="",
                    save_name=prefix + "tid%d" % (tid) + postfix,
                )

            if if_blender:
                tmp_code = base64.b32encode(im_filename[0].encode()).decode()
                blender_render(
                    input_dict_show,
                    output_RCNN,
                    im_file,
                    tmp_code=tmp_code,
                    render_type="chair",
                    pick=-1,
                    grid=False,
                    current_dir=os.path.join(os.getcwd()),
                    save_name=tmp_code,
                )
        num_plots += 1
        if num_plots >= 1:
            break