In [None]:
%load_ext autoreload
%autoreload 2

import sys
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    %pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
    %pip install pyyaml==5.1 filterpy imagesize tensorboard moviepy
    %pip install --upgrade protobuf

    # Install detectron2 that matches the above pytorch version
    # See https://detectron2.readthedocs.io/tutorials/install.html for instructions
    %pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/$CUDA_VERSION/torch$TORCH_VERSION/index.html

    exit(0)  # After installation, you may need to "restart runtime" in Colab. This line can also restart runtime

In [None]:
import torch

TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
CUDA_VERSION = torch.__version__.split("+")[-1]
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)


gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
%load_ext autoreload
%autoreload 2

import sys
IN_COLAB = "google.colab" in sys.modules

# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import copy
import datetime
import functools
import io
import json
import os
import random
import time
from collections import namedtuple

import cv2
import moviepy as mpy
import numpy as np
import torch
import tqdm
from filterpy.kalman import KalmanFilter
from matplotlib import pyplot as plt
from scipy.optimize import linear_sum_assignment

from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer, ColorMode
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.engine import DefaultTrainer
from detectron2.export.flatten import TracingAdapter
from detectron2_backbone import *


if not IN_COLAB:
  def cv2_imshow(img):
    """Local stand-in for Colab's cv2_imshow: draw a BGR image in a new 24x16 matplotlib figure."""
    plt.figure(figsize=(24, 16))
    rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb_img)
else:
  # Colab ships its own notebook-friendly replacement for cv2.imshow.
  from google.colab.patches import cv2_imshow

# WARNING: adjust maxsize if out of memory
@functools.lru_cache(maxsize=400)
def cv2_imread(img_path):
  """Read an image with OpenCV, memoising up to 400 decoded images by path.

  Args:
    img_path: path of the image file (must be hashable, since it is the cache key).

  Returns:
    The decoded image as returned by cv2.imread. The same array object is handed
    out on every cache hit, so callers must not modify it in place.

  Raises:
    FileNotFoundError: if cv2.imread returns None (missing or undecodable file).
      Raising keeps the failure out of the cache, so a later retry re-reads the file.
  """
  img = cv2.imread(img_path)
  if img is None:
    raise FileNotFoundError(f"cv2.imread could not read image: {img_path}")
  return img

In [35]:
# Setup colab environment
# Only runs on Colab: mounts Google Drive for backups and downloads the dataset into the
# current working directory.

if IN_COLAB:
    ## Connect to GDrive, for ease of backup
    from google.colab import drive

    drive.mount("/content/drive")
    os.makedirs("/content/drive/MyDrive/OvercookedColab/", exist_ok=True)

    # Download dataset
    # The archive is unpacked in place; the label file is saved as
    # Overcooked2_1-1/detection_dataset.json.
    !wget https://www.dropbox.com/s/cu8zkb5bf2slqso/Overcooked2_1-1.zip?dl=1 -O Overcooked2_1-1.zip
    !unzip Overcooked2_1-1.zip
    !wget https://www.dropbox.com/s/8x5c004whvk646e/detection_dataset.json?dl=1 -O Overcooked2_1-1/detection_dataset.json

In [36]:
## Load dataset labels and split into train/val subsets

from pathlib import Path

from overcooked_ai.dataset_types import DetectionDataset

if IN_COLAB:
    SOURCE_DIR = Path("Overcooked2_1-1")
else:
    SOURCE_DIR = Path("/home/mimic/Overcooked2_1-1_jpeg")
# Make SOURCE_DIR absolute. Entries below get SOURCE_DIR-prefixed file names, and other
# cells call os.path.join(SOURCE_DIR, entry.file_name) again; that second join is only a
# no-op when the prefixed name is absolute (a relative SOURCE_DIR would be prepended twice).
SOURCE_DIR = SOURCE_DIR.resolve()
rng_seed = 1337
training_subset_ratio = 0.75
# NOTE(review): the Colab setup cell downloads "detection_dataset.json", not this file name -- confirm.
dataset_json = SOURCE_DIR / "detection_dataset.mar2025.json"

# Load dataset
# NOTE(review): other cells call DetectionDataset.loadFromJson / saveToJson; confirm which spelling the class provides.
dataset = DetectionDataset.load_from_json(dataset_json)

# Update file_name to absolute paths
for entry in dataset.entries:
    entry.file_name = str(SOURCE_DIR / entry.file_name)

# Remove all segmentation annotations
for entry in dataset.entries:
    for anno in entry.annotations:
        anno.segmentation_rle_counts = None

dataset_dict = dataset.to_dict()["dataset_dict"]
thing_classes = dataset.thing_classes
# Every class is drawn in white by the visualizer.
thing_colors = [(255, 255, 255) for _ in range(len(thing_classes))]

# Split into train and val
# Seeded shuffle so the split is reproducible across kernel restarts.
np.random.seed(rng_seed)
np.random.shuffle(dataset_dict)
num_training_entries = int(len(dataset_dict) * training_subset_ratio)
train_dataset_dict = dataset_dict[:num_training_entries]
val_dataset_dict = dataset_dict[num_training_entries:]

# Register datasets into detectron2
# Existing registrations are removed first so the cell can be re-run.
for d in ["train", "val"]:
    dataset_name = "overcooked_" + d
    if dataset_name in DatasetCatalog:
        DatasetCatalog.remove(dataset_name)
        MetadataCatalog.remove(dataset_name)
    # d=d binds the current split name; a plain closure would see the last loop value.
    DatasetCatalog.register(dataset_name, lambda d=d: train_dataset_dict if d == "train" else val_dataset_dict)
    MetadataCatalog.get(dataset_name).set(thing_classes=thing_classes, thing_colors=thing_colors)
overcooked_metadata = MetadataCatalog.get("overcooked_train")

In [None]:
## DEBUG: Visualize sample images and labels in training dataset

# Draw the ground-truth annotations of two randomly chosen training records.
for record in random.sample(train_dataset_dict, 2):
    frame_bgr = cv2_imread(record["file_name"])
    # Visualizer is given the channel-reversed image and hands back the same ordering.
    visualizer = Visualizer(
        frame_bgr[:, :, ::-1],
        metadata=overcooked_metadata,
        scale=0.5,
        instance_mode=ColorMode.SEGMENTATION,
    )
    visualizer._default_font_size = 40
    rendered = visualizer.draw_dataset_dict(record)
    plt.figure(figsize=(20, 14))
    cv2_imshow(rendered.get_image()[:, :, ::-1])

In [None]:
# Train
# Builds the detectron2 config for a Faster R-CNN (R50-FPN) detector on the registered
# overcooked_train / overcooked_val datasets, writes it to <OUTPUT_DIR>/model.yaml, trains,
# then rewrites model.yaml with MODEL.WEIGHTS pointing at the final checkpoint.
# Several solver settings are assigned more than once below; the LAST assignment wins.

cfg = get_cfg()

cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml"))
# TODO: try training ViT-based

# NOTE: this model needs gt_masks to be present in every single annotation in the dataset
# cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))

cfg.DATASETS.TRAIN = ("overcooked_train",)
cfg.DATASETS.TEST = ("overcooked_val",)
cfg.INPUT.RANDOM_FLIP = "none"
cfg.MODEL.WEIGHTS = ""  #start with random initial weights
cfg.SOLVER.BASE_LR = 0.0005
# MAX_ITER, GAMMA and STEPS here are superseded by the "Tuning" / "higher LR" sections further down.
cfg.SOLVER.MAX_ITER = 30000
cfg.SOLVER.GAMMA = 0.5
cfg.SOLVER.WARMUP_FACTOR = 1.0 / 1000
cfg.SOLVER.WARMUP_ITERS = 1000
cfg.SOLVER.STEPS = [3000, 10000, 16000, 24000]  # update LR as GAMMA*LR at STEPS[i]
cfg.SOLVER.CHECKPOINT_PERIOD = 2000
cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[1.0,]]
cfg.MODEL.ANCHOR_GENERATOR.OFFSET = 0.5
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.75]
cfg.MODEL.ROI_HEADS.NUM_CLASSES = len(thing_classes)
cfg.TEST.DETECTIONS_PER_IMAGE = 500
cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.75
cfg.SEED = rng_seed
# Each run gets its own timestamped output directory.
if IN_COLAB:
    cfg.OUTPUT_DIR = os.path.join("overcooked_models", datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
else:
    cfg.OUTPUT_DIR = os.path.join("/home/mimic/objdet/overcooked_models", datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
# (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)

cfg.INPUT.MIN_SIZE_TRAIN = (640, 672, 704, 736, 768, 800)
cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice"
cfg.INPUT.MAX_SIZE_TRAIN = 1080
cfg.INPUT.MIN_SIZE_TEST = 800
cfg.INPUT.MAX_SIZE_TEST = 1080
cfg.INPUT.MASK_FORMAT = "bitmask"
# Same value as assigned near the top of this cell.
cfg.INPUT.RANDOM_FLIP = "none"

# Free Colab settings
cfg.DATALOADER.NUM_WORKERS = 2
cfg.SOLVER.IMS_PER_BATCH = 2

# Tuning for better convergence
# MAX_ITER and STEPS are overridden again just below; GAMMA = 0.75 is the effective value.
cfg.SOLVER.MAX_ITER = 200000
cfg.SOLVER.STEPS = [3000, 6000, 10000, 14000, 20000, 25000, 30000, 50000, 75000]  # update LR as GAMMA*LR at STEPS[i]
cfg.SOLVER.GAMMA = 0.75

# TODO: try training at higher LR
# Effective schedule: these are the final MAX_ITER / STEPS values.
cfg.SOLVER.MAX_ITER = 100000
cfg.SOLVER.STEPS = [5000, 10000, 15000, 25000, 50000]  # update LR as GAMMA*LR at STEPS[i]

# Colab Pro settings  # NOTE: these almost double training time, but doesn't seem to improve convergence rate but rather actually slower
# cfg.DATALOADER.NUM_WORKERS = 2
# cfg.SOLVER.IMS_PER_BATCH = 4

# Persist the config before training so a run can be resumed from model.yaml.
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
with open(os.path.join(cfg.OUTPUT_DIR, "model.yaml"), "w") as fh:
    fh.write(cfg.dump())

trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()

# Re-save the config with MODEL.WEIGHTS pointing at the final checkpoint.
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
with open(os.path.join(cfg.OUTPUT_DIR, "model.yaml"), "w") as fh:
    fh.write(cfg.dump())

In [None]:
# Resume previous training
# Rebuild the config from a saved run's model.yaml and continue training in its OUTPUT_DIR.

cfg = get_cfg()
cfg.merge_from_file("/home/mimic/objdet/overcooked_models/20250323_180506/model.yaml")

trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=True)
trainer.train()

# Point the saved config at the final checkpoint and write it back next to the weights.
final_weights_path = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.WEIGHTS = final_weights_path
config_yaml_path = os.path.join(cfg.OUTPUT_DIR, "model.yaml")
with open(config_yaml_path, "w") as yaml_file:
    yaml_file.write(cfg.dump())

In [None]:
# Sanity check: estimate the grid->frame homography from corner correspondences and compare
# both the matrix and the re-projected corners against reference values.
# NOTE(review): DifferentiableDLT, batch_apply_homography, corner_grid_xys, corner_frame_xys
# and H_frame_grid are not defined in any cell visible here -- presumably they come from a
# star import or a removed cell; this cell fails on a fresh kernel unless they are provided.
dlt = DifferentiableDLT()

# Add a leading batch dimension of size 1.
T_grid_xys = torch.tensor(corner_grid_xys, dtype=torch.float64).unsqueeze(0)
T_frame_xys = torch.tensor(corner_frame_xys, dtype=torch.float64).unsqueeze(0)

T_H_frame_grid = dlt(source_xys=T_grid_xys, target_xys=T_frame_xys)
print("T_H_frame_grid\n", T_H_frame_grid[0])

print("diff w/ gt:")
print(np.abs(T_H_frame_grid.cpu().detach().numpy() - H_frame_grid))

T_pred_frame_xys = batch_apply_homography(T_H_frame_grid, T_grid_xys)
print("T_pred_frame_xys\n", T_pred_frame_xys)
print("diff w/ gt:\n", np.abs(T_pred_frame_xys.cpu().detach().numpy() - corner_frame_xys))

In [None]:
## Back up all content to Google Drive
# Deletes checkpoints matching model_0*.pth in every run directory, prints disk usage,
# then copies the run directories to the mounted Drive folder.
if IN_COLAB:
    !rm overcooked_models/*/model_0*.pth
    !du -h overcooked_models
    !cp -r overcooked_models/* /content/drive/MyDrive/OvercookedColab/

In [None]:
# Save trained model to path
# Colab only: trigger a browser download of the final checkpoint from cfg.OUTPUT_DIR.
if IN_COLAB:
    from google.colab import files
    # files.download(os.path.join(cfg.OUTPUT_DIR, "model_0014999.pth"))
    files.download(os.path.join(cfg.OUTPUT_DIR, "model_final.pth"))

In [None]:
# Upload previous checkpoint
# Colab only: open the upload dialog and print the keys of the returned mapping.
if IN_COLAB:
    from google.colab import files
    content_dict = files.upload()
    print(list(content_dict.keys()))

In [None]:
# Look at training curves in tensorboard:
# The log directory is relative to the working directory; outside Colab the training cell
# writes to /home/mimic/objdet/overcooked_models instead, so adjust --logdir accordingly.
%load_ext tensorboard
%tensorboard --logdir overcooked_models/

In [47]:
# Load existing model
# Rebuilds `cfg` from the model.yaml of one specific, hard-coded local run directory.
cfg = get_cfg()
cfg.merge_from_file(os.path.join("/home/mimic/objdet/overcooked_models", "20250323_180506", "model.yaml"))


In [None]:
# Inference should use the config with parameters that are used in training
# Work on a copy so the training config `cfg` stays untouched.
eval_cfg = copy.deepcopy(cfg)
eval_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.15
eval_cfg.MODEL.WEIGHTS = os.path.join(eval_cfg.OUTPUT_DIR, "model_final.pth")
eval_cfg.DATASETS.TEST = ("overcooked_val",)
predictor = DefaultPredictor(eval_cfg)

In [None]:
# Show predictions on two randomly chosen validation records.
for record in random.sample(val_dataset_dict, 2):
    frame_bgr = cv2_imread(record["file_name"])
    # format is documented at https://detectron2.readthedocs.io/tutorials/models.html#model-output-format
    predictions = predictor(frame_bgr)
    visualizer = Visualizer(
        frame_bgr[:, :, ::-1],
        metadata=overcooked_metadata,
        scale=0.5,
        instance_mode=ColorMode.SEGMENTATION,
    )
    visualizer._default_font_size = 27
    cpu_instances = predictions["instances"].to("cpu")
    rendered = visualizer.draw_instance_predictions(cpu_instances)
    plt.figure(figsize=(20, 14))
    cv2_imshow(rendered.get_image()[:, :, ::-1])

In [None]:
# Render predictions for every dataset entry into <OUTPUT_DIR>/eval.mp4 (5 fps, half size).
img_scale = 0.5
eval_mp4_path = os.path.join(eval_cfg.OUTPUT_DIR, "eval.mp4")
# The first image defines the output frame size.
im = cv2_imread(os.path.join(SOURCE_DIR, dataset.entries[0].file_name))
frame_size = (int(im.shape[1] * img_scale), int(im.shape[0] * img_scale))
video_out = cv2.VideoWriter(eval_mp4_path, cv2.VideoWriter_fourcc("M","P","4","V"), 5, frame_size)
# VideoWriter does not raise when it cannot open the output; fail loudly instead of
# silently producing no video.
if not video_out.isOpened():
    raise RuntimeError(f"Could not open video writer for {eval_mp4_path}")

try:
    for entry in tqdm.tqdm(dataset.entries):
        img_path = os.path.join(SOURCE_DIR, entry.file_name)
        im = cv2_imread(img_path)
        outputs = predictor(im)
        v = Visualizer(im[:, :, ::-1],
                       metadata=overcooked_metadata,
                       scale=img_scale,
                       instance_mode=ColorMode.SEGMENTATION
        )
        v._default_font_size = 27
        output_instances = outputs["instances"].to("cpu")
        # output_instances = prune_annotations(output_instances, max_iou_overlap=0.3)  # TODO: fix this (pbly need to convert to BBoxAnnotation)
        out = v.draw_instance_predictions(output_instances)
        img_overlay = out.get_image()[:, :, ::-1]
        # NOTE(review): frames are assumed to match frame_size; confirm for image sizes
        # that do not scale to whole pixels.
        video_out.write(img_overlay)
finally:
    # Always finalize the file, even if inference is interrupted part-way.
    video_out.release()
print(f"\nWrote to {eval_mp4_path}")

In [None]:
# Colab only: download the evaluation video written by the previous cell.
if IN_COLAB:
    from google.colab import files
    files.download(eval_mp4_path)

In [None]:
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.data import build_detection_test_loader
import pickle as pkl

# Run COCO-style evaluation on the validation split and pickle the resulting metrics.
eval_output_dir = os.path.join(eval_cfg.OUTPUT_DIR, "eval")
evaluator = COCOEvaluator("overcooked_val", output_dir=eval_output_dir)
val_loader = build_detection_test_loader(eval_cfg, "overcooked_val")
eval_results = inference_on_dataset(predictor.model, val_loader, evaluator)
print(eval_results)

eval_results_path = os.path.join(eval_cfg.OUTPUT_DIR, "eval_results.pkl")
with open(eval_results_path, "wb") as results_file:
  pkl.dump(eval_results, results_file, protocol=pkl.HIGHEST_PROTOCOL)

In [None]:
# Load dataset without labels and insert predictions
# Runs the detector on every image and stores the detections as annotations of a COPY of
# the dataset, which is then written to <OUTPUT_DIR>/detection_dataset.pred.json.
# The global labelled `dataset` is left untouched so other cells still see ground truth.

eval_cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.15
predictor = DefaultPredictor(eval_cfg)

foreground_categories = [
 'chef',
 'tuna',
 'plate_cut_shrimp',
 'plate_cut_tuna',
 'shrimp',
 'cut_shrimp',
 'cut_tuna',
 'icon_tuna',
 'icon_shrimp',
]
# Only used by the (currently disabled) foreground filter inside the loop below.
foreground_category_ids = set(dataset.thing_classes.index(c) for c in foreground_categories)

pred_dataset = copy.deepcopy(dataset)

total_pred_annos = 0
for entry in tqdm.tqdm(pred_dataset.entries):
  # Wipe existing labels
  entry.annotations = []

  # Predict on image, then store predicted annotations
  im = cv2_imread(os.path.join(SOURCE_DIR, entry.file_name))
  outputs = predictor(im)
  preds = outputs["instances"].to("cpu")
  pred_bboxes = preds.pred_boxes.tensor.numpy()
  pred_category_ids = preds.pred_classes.numpy()
  scores = preds.scores.numpy()
  for bbox, category_id, score in zip(pred_bboxes, pred_category_ids, scores):
    # if category_id not in foreground_category_ids:
    #   continue
    # NOTE(review): BBoxAnnotation is not imported in any visible cell -- confirm where it comes from.
    anno = BBoxAnnotation(bbox[0], bbox[1], bbox[2], bbox[3], category_id, score=score)
    entry.annotations.append(anno)
    total_pred_annos += 1

pred_dataset_json = os.path.join(eval_cfg.OUTPUT_DIR, "detection_dataset.pred.json")
pred_dataset.saveToJson(pred_dataset_json)
print(f"Wrote {total_pred_annos} total annotations to {pred_dataset_json}")

In [None]:
# Remove overlapping annotations
# Pipeline per entry: (1) drop low-score annotations, (2) prune overlaps inside each
# "combo" group of categories, (3) prune overlaps inside every remaining single category,
# (4) prune overlaps across all annotations with a looser threshold.

ds = DetectionDataset.loadFromJson(os.path.join(eval_cfg.OUTPUT_DIR, "detection_dataset.pred.json"))

min_score = 0.175
default_max_iou_overlap = 0.3
category_max_iou_overlap = {"chef": 0.35}
all_annos_max_iou_overlap = 0.8

combo_categories_list = (
    ("ctop", "cut_shrimp", "cut_tuna", "plate_cut_shrimp", "plate_cut_tuna"),
)

pruned_annos = 0
total_annos = 0
for entry in ds.entries:
    # Count original annotations; pruned ones are subtracted after the loop
    total_annos += len(entry.annotations)
    seen_categories = set()

    # Remove annotations with low scores (annotations without a score are kept)
    scored_annos = [
        anno for anno in entry.annotations
        if not (anno.score is not None and anno.score < min_score)
    ]
    pruned_annos += len(entry.annotations) - len(scored_annos)
    entry.annotations = scored_annos

    # Group all annotations in each combo_categories tuple, and prune
    for combo_categories in combo_categories_list:
        for category in combo_categories:
            assert(category not in seen_categories)
            seen_categories.add(category)
        target_annos = [
            anno for anno in entry.annotations
            if ds.thing_classes[anno.category_id] in combo_categories
        ]
        other_annos = [
            anno for anno in entry.annotations
            if ds.thing_classes[anno.category_id] not in combo_categories
        ]

        # The first category of the group with an explicit override decides the threshold
        max_iou_overlap = next(
            (category_max_iou_overlap[category] for category in combo_categories if category in category_max_iou_overlap),
            default_max_iou_overlap,
        )
        kept_target_annos = prune_annotations(target_annos, max_iou_overlap)
        entry.annotations = other_annos + kept_target_annos
        pruned_annos += len(target_annos) - len(kept_target_annos)

    # Iterate over all other categories
    for target_category_id, category in enumerate(ds.thing_classes):
        if category in seen_categories:
            continue
        seen_categories.add(category)
        target_annos = [anno for anno in entry.annotations if anno.category_id == target_category_id]
        other_annos = [anno for anno in entry.annotations if not (anno.category_id == target_category_id)]

        max_iou_overlap = category_max_iou_overlap.get(category, default_max_iou_overlap)
        kept_target_annos = prune_annotations(target_annos, max_iou_overlap)
        entry.annotations = other_annos + kept_target_annos
        pruned_annos += len(target_annos) - len(kept_target_annos)

    # Iterate over all categories
    target_annos = entry.annotations
    entry.annotations = prune_annotations(target_annos, all_annos_max_iou_overlap)
    pruned_annos += len(target_annos) - len(entry.annotations)

kept_annos = total_annos - pruned_annos

print(f"Pruned {pruned_annos} and kept {kept_annos} annotations")

ds.saveToJson(os.path.join(eval_cfg.OUTPUT_DIR, "detection_dataset.pred_2.json"))

In [None]:
# Colab only: download the prediction dataset JSON (path set by the prediction-insertion cell).
if IN_COLAB:
    from google.colab import files
    files.download(pred_dataset_json)

In [None]:
## Back up all content to Google Drive
# Colab: prune checkpoints matching model_0*.pth in every run directory and copy all runs to Drive.
# Elsewhere: do the same for the single run directory in eval_cfg.OUTPUT_DIR.
if IN_COLAB:
    !rm overcooked_models/*/model_0*.pth
    !du -h overcooked_models
    !cp -r overcooked_models/* /content/drive/MyDrive/OvercookedColab/
else:
    # TODO: test this logic
    # $CKPT_DIR below is expanded by IPython from the Python variable of the same name.
    CKPT_DIR = eval_cfg.OUTPUT_DIR
    !rm $CKPT_DIR/model_0*.pth
    !cp -r $CKPT_DIR /home/mimic/GDrive/OvercookedColab/


In [None]:
## TEST: Export model via tracing
# Traces the predictor's model with detectron2's TracingAdapter on one sample image,
# checks that traced and eager outputs agree, and saves the traced module.
from detectron2.utils.testing import assert_instances_allclose

batch_size = 1

# Hard-coded to CUDA: this cell needs a GPU.
model = predictor.model.to("cuda")
exported_model_path = os.path.join(cfg.OUTPUT_DIR, "model_exported.pt")
# exported_model_path = "model_exported.pt"

# Rebinds the global `dataset` to a fresh copy loaded from dataset_json.
dataset = DetectionDataset.loadFromJson(dataset_json)

# Load sample image
image = cv2_imread(os.path.join(SOURCE_DIR, dataset.entries[0].file_name))
# HWC -> CHW, as a contiguous tensor.
image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))
inputs = tuple(image.clone() for _ in range(batch_size))

def inference_func(model, image):
  """Run un-postprocessed inference on a single CHW image and return the first result."""
  inputs = [{"image": image}]
  return model.inference(inputs, do_postprocess=False)[0]

wrapper = TracingAdapter(model, inputs, inference_func)
wrapper.eval()
with torch.no_grad():
    traced_model = torch.jit.trace(wrapper, inputs)
    outputs = inference_func(model, *inputs)
    # Rebuild structured outputs from the traced model's flat tensors for comparison.
    traced_outputs = wrapper.outputs_schema(traced_model(*inputs))

    # NOTE(review): inference_func accepts exactly one image, so the batch_size > 1 path
    # looks unreachable as written -- confirm before raising batch_size.
    if batch_size > 1:
        for output, traced_output in zip(outputs, traced_outputs):
            assert_instances_allclose(output, traced_output, size_as_tensor=True)
    else:
        assert_instances_allclose(outputs, traced_outputs, size_as_tensor=True)
    print("Successfully exported traced model")
  
    traced_model.save(exported_model_path)
    print(f"Saved to {exported_model_path}")

In [None]:
## TEST: Inference speed of traced model
# Times 10 forward passes of the exported model on `image` (set by the export cell);
# the first two runs are treated as warm-up and excluded from the statistics.

device = "cuda"
loaded_model = torch.jit.load(exported_model_path)

# image = cv2.imread(os.path.join(SOURCE_DIR, dataset.entries[0].file_name))
# image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))

# CUDA kernels are launched asynchronously, so the GPU must be synchronised around the
# timed region; otherwise the clock may stop before the work has finished.
sync_cuda = torch.cuda.is_available()

dts = []
for idx in range(10):
  if sync_cuda:
    torch.cuda.synchronize()
  tic = time.perf_counter()
  with torch.no_grad():
    pred_boxes, pred_classes, scores, height_width = loaded_model(image)
  if sync_cuda:
    torch.cuda.synchronize()
  toc = time.perf_counter()
  if idx > 1:
    dts.append(toc - tic)
print(f"Did {len(dts)} inferences, avg {np.mean(dts):.3f} sec (std: {np.std(dts):.3f})")

In [None]:
# Colab only: download the traced model saved by the export cell.
if IN_COLAB:
    from google.colab import files
    files.download(exported_model_path)

In [None]:
## TEST: inference speed of detectron2 model
# Each timed run includes moving the predicted instances to the CPU.

im = cv2_imread("frame_20210703_162259.789.png")
num_runs = 10
dts = []
for _ in range(num_runs):
  started_at = time.time()
  outputs = predictor(im)
  preds = outputs["instances"].to("cpu")
  dts.append(time.time() - started_at)
print(f"Did {len(dts)} inferences, avg {np.mean(dts):.3f} sec (std: {np.std(dts):.3f})")

In [None]:
## TEST: create a Module with backbone as additional output, and trace it

import collections
from detectron2.structures.image_list import ImageList
from torch import nn

# Reuse the predictor's model in eval mode and prepare one CHW image tensor for tracing.
model = predictor.model
model.eval()
im = cv2_imread("frame_20210703_162259.789.png")
# HWC -> CHW, as a contiguous tensor.
image_tensor = torch.from_numpy(np.ascontiguousarray(im.transpose(2, 0, 1)))

class CustomModel(nn.Module):
    """Wraps an R-CNN so one forward pass returns detections plus the FPN "p2" feature map.

    The wrapped object must provide preprocess_image, backbone, proposal_generator and
    roi_heads, which are called in that order.
    """

    Out = collections.namedtuple("Out", ("pred_boxes", "scores", "pred_classes", "fpn_p2"))

    def __init__(self, rcnn):
        super().__init__()
        self.rcnn = rcnn

    def forward(self, x):
        """Run detection on a single image `x` and return a CustomModel.Out tuple."""
        rcnn = self.rcnn
        images = rcnn.preprocess_image([{"image": x}])
        features = rcnn.backbone(images.tensor)
        proposals, _ = rcnn.proposal_generator(images, features)
        instances, _ = rcnn.roi_heads(images, features, proposals)
        # Exactly one image went in, so exactly one result is expected back.
        assert(len(instances) == 1)
        detections = instances[0]
        return CustomModel.Out(
            pred_boxes=detections.pred_boxes.tensor,
            scores=detections.scores,
            pred_classes=detections.pred_classes,
            fpn_p2=features["p2"],
        )

cmodel = CustomModel(model)
with torch.no_grad():
  # Eager output first, then trace the wrapper and run the traced module on the same input.
  output = cmodel(image_tensor)
  traced_cmodel = torch.jit.trace(cmodel, image_tensor)
  traced_output = traced_cmodel(image_tensor)
  # Traced outputs are positional, in the same order as the namedtuple fields.
  fields_match = [
      torch.all(getattr(output, field).eq(traced_output[pos])).cpu()
      for pos, field in enumerate(output._fields)
  ]
  assert np.all(fields_match), "traced model output mismatches"

In [None]:
######
######
# HOMOGRAPHY REGRESSOR MODEL
######
######

import shutil

import torch
import torchvision
from detectron2.modeling import build_model
from torch.utils.tensorboard import SummaryWriter

def vec2H(entry_H_grid_img):
    """Convert a flat 9-element homography into a 3x3 matrix normalised so H[2, 2] == 1.

    Args:
        entry_H_grid_img: sequence/array of 9 numbers (row-major 3x3 homography).

    Returns:
        A new 3x3 float64 array; the input is never modified.
    """
    # Force a float copy: with integer input an in-place true division would raise,
    # because the float result cannot be stored back into an integer array.
    H_grid_img = np.array(entry_H_grid_img, dtype=np.float64).reshape(3, 3)
    return H_grid_img / H_grid_img[2, 2]


def compute_whiten_H_vec(dataset):
    """
    Compute per-component whitening statistics of the 8-vector homographies in `dataset`.

    Usage: H_vec_whitened = (H_vec + H_vec_bias) * H_vec_scale
           H_vec = H_vec_whitened / H_vec_scale - H_vec_bias

    Args:
        dataset: object with `.entries`, each entry having a 9-element `H_grid_img`.

    Returns:
        (H_vec_bias, H_vec_scale): two length-8 arrays, the negated mean and the
        reciprocal standard deviation of the first 8 homography components.

    Raises:
        ValueError: if the dataset has no entries (statistics would be NaN).
    """
    if len(dataset.entries) == 0:
        raise ValueError("cannot compute whitening statistics of an empty dataset")
    Hs = np.zeros((len(dataset.entries), 8))
    for idx, entry in enumerate(dataset.entries):
        H_grid_img = vec2H(entry.H_grid_img)
        Hs[idx, :] = H_grid_img.flatten()[:8]
    H_vec_bias = -np.mean(Hs, axis=0)
    H_vec_std = np.std(Hs, axis=0)
    # A component that is constant over the dataset has zero spread; use a unit scale for it
    # so it whitens to 0 instead of producing inf scale (and NaN targets).
    H_vec_scale = 1 / np.where(H_vec_std > 0, H_vec_std, 1.0)
    return H_vec_bias, H_vec_scale


class ImageHomographyDataset(torch.utils.data.Dataset):
    """Torch dataset yielding {"image", "H_grid_img"} samples from a detection dataset.

    The entries are split in half by position: the first half is the training subset and
    the second half the evaluation subset (no shuffling).
    """

    def __init__(self, detection_dataset, is_train, transform=None):
        """Keep a reference to the full dataset and a deep copy restricted to one half.

        Args:
            detection_dataset: object with an `entries` list.
            is_train: truthy selects the first half of the entries, falsy the second half.
            transform: optional callable applied to every sample dict.
        """
        self.transform = transform
        self.ds_all = detection_dataset

        self.ds = copy.deepcopy(self.ds_all)
        split_at = len(self.ds_all.entries) // 2
        self.ds.entries = self.ds.entries[:split_at] if is_train else self.ds.entries[split_at:]

    def __len__(self):
        return len(self.ds.entries)

    def __getitem__(self, entry_idx):
        """Load the image and the normalised 3x3 homography of the entry at `entry_idx`."""
        entry = self.ds.entries[entry_idx]
        sample = {
            "image": cv2_imread(os.path.join(SOURCE_DIR, entry.file_name)),
            "H_grid_img": vec2H(entry.H_grid_img),
        }
        return self.transform(sample) if self.transform else sample


# TODO: test if this works and makes sense; remember to keep source image size in order to be able to batch
class RandomTranslate(object):
    """
    Uniformly sample dx/dy, then crop a shifted version of the input image

    The image content is moved up/left by (dx, dy) pixels and the uncovered bottom/right
    area is zero-filled, so the output keeps the input shape. The homography is composed
    with the matching translation and re-normalised so that H[2, 2] == 1.
    """

    def __init__(self, dx_minmax=(0, 100), dy_minmax=(0, 100)):
        """Store the (low, high) sampling ranges, in pixels, for the x and y shifts."""
        self.dx_minmax = dx_minmax
        self.dy_minmax = dy_minmax

    def __call__(self, sample):
        """Return a new sample dict with shifted "image" and adjusted "H_grid_img".

        Only these two keys are carried over; the input sample is not modified.
        """
        dx = int(np.random.uniform(*self.dx_minmax))
        dy = int(np.random.uniform(*self.dy_minmax))
        source_image = sample["image"]
        cropped_image = source_image[dy:, dx:]
        shifted_image = np.zeros(source_image.shape, source_image.dtype)
        shifted_image[:cropped_image.shape[0], :cropped_image.shape[1]] = cropped_image

        # A pixel at (x, y) in the shifted image was at (x + dx, y + dy) in the source.
        translation = np.array(((1, 0, dx), (0, 1, dy), (0, 0, 1)))
        shifted_H = sample["H_grid_img"] @ translation
        # Composition changes H[2, 2] whenever H has perspective terms; consumers that keep
        # only the first 8 components rely on H[2, 2] == 1, so re-normalise.
        if shifted_H[2, 2] != 0:
            shifted_H = shifted_H / shifted_H[2, 2]
        shifted_sample = {
            "image": shifted_image,
            "H_grid_img": shifted_H,
        }
        return shifted_sample


class ToTensor(object):
    """Turn a numpy sample into tensors: HWC image -> CHW tensor, "H_vec" -> float32 tensor."""

    def __call__(self, sample):
        """Return a new dict holding only the "image" and "H_vec" tensors."""
        chw_image = np.ascontiguousarray(sample["image"].transpose(2, 0, 1))
        h_vec_f32 = sample["H_vec"].astype("float32")
        return {
            "image": torch.from_numpy(chw_image),
            "H_vec": torch.from_numpy(h_vec_f32),
        }

class HomographyRegressor(torch.nn.Module):
    """Regress an 8-component homography vector from a frozen R-CNN backbone's FPN features.

    Args:
        rcnn: detection model providing `preprocess_image`, `backbone` and `device`.
        image_width_height: input image size, used to size the first linear layer when
            global pooling is off.
        global_pooling: if True, feed the spatial mean of p2..p6 (5 * channels values)
            to the regressor; otherwise feed the flattened p6 feature map.
        fpn_p_num_channels: number of channels per FPN level.
        fc_size: width of the hidden fully connected layers.
    """

    def __init__(self, rcnn, image_width_height=(1920,1080), global_pooling=False, fpn_p_num_channels=256, fc_size=256):
        super().__init__()
        self.global_pooling = global_pooling
        self.preprocess_image = rcnn.preprocess_image
        self.backbone = rcnn.backbone
        # Freeze the backbone so no gradients are computed for it. Assigning a
        # `require_grad` attribute on the module has no effect on its parameters;
        # requires_grad_() is the Module API for this. The backbone is shared with
        # `rcnn`, so its parameters are frozen there too.
        self.backbone.requires_grad_(False)
        self.device = rcnn.device
        # NOTE(review): the non-pooled size assumes the p6 map is exactly (W/64) x (H/64);
        # confirm against the real p6 shape (any padding in preprocess_image changes it).
        input_size = fpn_p_num_channels*5 if global_pooling else int(fpn_p_num_channels*image_width_height[0]*image_width_height[1]*(1/64)*(1/64))
        self.regressor = torch.nn.Sequential(
            torch.nn.Dropout(),
            torch.nn.Linear(input_size, fc_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(),
            torch.nn.Linear(fc_size, fc_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Linear(fc_size, 8),
        ).to(self.device)

    def forward(self, x):
        """Predict one 8-vector per sample.

        Args:
            x: batch in the format accepted by `rcnn.preprocess_image`; `len(x)` must be
               the number of samples.

        Returns:
            Tensor of shape (len(x), 8).
        """
        images = self.preprocess_image(x)
        features = self.backbone(images.tensor)

        if self.global_pooling:
            # Spatial mean of every pyramid level, concatenated along the channel axis.
            feature_vec = torch.cat(
                tuple(features[level].mean([2, 3]) for level in ("p2", "p3", "p4", "p5", "p6")),
                dim=1,
            )
        else:
            # Flatten the coarsest level only.
            num_samples = len(x)
            feature_vec = features["p6"].reshape((num_samples, -1))
        H_vec = self.regressor(feature_vec)

        return H_vec

#####

# Rebinds the global `dataset` to a fresh copy loaded from dataset_json.
# NOTE(review): the dataset-loading cell calls DetectionDataset.load_from_json -- confirm which spelling exists.
dataset = DetectionDataset.loadFromJson(dataset_json)

# Whitening statistics are computed over ALL entries (train and test halves alike).
H_vec_bias, H_vec_scale = compute_whiten_H_vec(dataset)
class FlattenWhitenHomography(object):
    """Add a whitened 8-vector "H_vec" to a sample, using the module-level H_vec_bias / H_vec_scale."""

    def __call__(self, sample):
        """Write sample["H_vec"] in place from sample["H_grid_img"] and return the same dict."""
        first_eight = sample["H_grid_img"].flatten()[:8]
        sample["H_vec"] = (first_eight + H_vec_bias) * H_vec_scale
        return sample

# Train/test datasets share the same transform chain: whiten the homography, then convert to tensors.
train_dataset = ImageHomographyDataset(dataset, is_train=True,
    transform=torchvision.transforms.Compose([
        # RandomTranslate(),  # TODO: experiment
        FlattenWhitenHomography(),
        ToTensor(),
    ]))

test_dataset = ImageHomographyDataset(dataset, is_train=False,
    transform=torchvision.transforms.Compose([
        FlattenWhitenHomography(),
        ToTensor(),
    ]))

# collate_fn=list keeps each batch as a plain list of sample dicts instead of stacking tensors.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=2, num_workers=1, collate_fn=list)

test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=2, num_workers=1, collate_fn=list)

In [None]:
# Smoke test: push one training batch through preprocessing and the backbone.
# NOTE(review): `detectron2_model` is only created in the NEXT cell, so this cell fails on a
# top-to-bottom run of a fresh kernel.
x = next(iter(train_loader))
images = detectron2_model.preprocess_image(x)
features = detectron2_model.backbone(images.tensor)


In [None]:
# Build a detectron2 model from a saved run config and wrap it in a HomographyRegressor.
# NOTE(review): build_model only constructs the network; no checkpoint load is visible in
# this cell -- confirm whether trained weights are expected to be loaded here.
detectron2_cfg = get_cfg()
# detectron2_cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml"))
# detectron2_cfg.MODEL.WEIGHTS = "overcooked_models/20220103_225637/model_final.pth"
# detectron2_cfg.MODEL.WEIGHTS = os.path.join(eval_cfg.OUTPUT_DIR, "model_final.pth")

detectron2_cfg.merge_from_file("overcooked_models/20220104_163857/model.yaml")

detectron2_model = build_model(detectron2_cfg)
detectron2_model.eval()

# Rebinds the global `model` (earlier cells use that name for the detector itself).
model = HomographyRegressor(detectron2_model)

In [None]:
class HomographyRegressorTrainer(object):
    """Training/evaluation loop for a model exposing a `regressor` sub-module.

    Only `model.regressor` parameters are optimised (Adam, MSE loss, StepLR halving the
    learning rate every 5000 iterations). Scalars are logged to TensorBoard in `log_dir`.
    """

    CKPT_NAME = "homography_regressor.pt"

    def __init__(self, model, log_dir, lr=4e-5, device_name="cuda:0"):
        """Create the writer, optimiser and scheduler; falls back to CPU without CUDA.

        Args:
            model: module with a `regressor` attribute; called as `model(batch)`.
            log_dir: directory for TensorBoard logs (created if missing).
            lr: initial learning rate.
            device_name: device used when CUDA is available.
        """
        self.log_dir = log_dir
        os.makedirs(self.log_dir, exist_ok=True)

        self.writer = SummaryWriter(log_dir=self.log_dir)
        self.device = torch.device(device_name if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)
        self.criterion = torch.nn.MSELoss(reduction="mean")
        self.optimizer = torch.optim.Adam(self.model.regressor.parameters(), lr=lr)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=5000, gamma=0.5)

        self.iter_idx = 0

    @staticmethod
    def load(log_dir, device_name="cuda:0"):
        """Restore a trainer from `log_dir`. Not implemented yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError()

    def save(self, ckpt_path=None):
        """Checkpointing is currently disabled: this is a no-op that returns None.

        TODO: save iter_idx plus model/optimizer/scheduler state dicts, defaulting to
        os.path.join(self.log_dir, CKPT_NAME), and implement `load` to match.
        """
        return

    def flush_writer(self):
        """Flush pending TensorBoard events to disk."""
        self.writer.get_file_writer().flush()

    def train(self, loader, num_iters=50000, evalloader=None, eval_every_n_iters=5000, save_every_n_iters=1000):
        """Train until `self.iter_idx` reaches `num_iters`, cycling over `loader` as needed.

        Args:
            loader: iterable of batches; each batch is a list of dicts with an "H_vec" tensor.
            num_iters: total iteration count to reach (counting iterations already done).
            evalloader: optional loader evaluated every `eval_every_n_iters` iterations.
            eval_every_n_iters: evaluation period; <= 0 disables evaluation.
            save_every_n_iters: checkpoint period; <= 0 disables saving.

        Returns:
            List of per-iteration training losses from this call.

        Raises:
            ValueError: if `loader` yields no batches (the loop could never make progress).
        """
        self.model.train()
        latest_loss = -1
        latest_eval_loss = -1
        iter_pbar = tqdm.tqdm_notebook(total=num_iters, initial=self.iter_idx, desc="train")
        losses = []
        try:
            while self.iter_idx < num_iters:
                saw_batch = False
                for batch in loader:
                    saw_batch = True
                    gt_H_vec = torch.stack(tuple(entry["H_vec"] for entry in batch)).to(self.device)
                    self.optimizer.zero_grad()
                    pred_H_vec = self.model(batch)
                    loss = self.criterion(pred_H_vec, gt_H_vec)
                    loss.backward()
                    # Learning rate actually used for this update, as a plain float.
                    lr = self.optimizer.param_groups[0]["lr"]
                    # The optimizer must step before the scheduler; the reverse order
                    # skips the first value of the learning-rate schedule.
                    self.optimizer.step()
                    self.scheduler.step()

                    latest_loss = loss.item()
                    losses.append(latest_loss)
                    iter_pbar.update()
                    iter_pbar.set_postfix(a=lr, eL=latest_eval_loss, tL=latest_loss)
                    self.writer.add_scalar("train/lr", lr, self.iter_idx)
                    self.writer.add_scalar("train/loss", latest_loss, self.iter_idx)

                    self.iter_idx += 1
                    if eval_every_n_iters > 0 and evalloader is not None and self.iter_idx % eval_every_n_iters == 0:
                        latest_eval_loss = self.eval(evalloader)["avg_L2"]
                        # eval() switches the model to eval mode; go back to training mode
                        # so dropout stays active for the remaining iterations.
                        self.model.train()
                    iter_pbar.set_postfix(a=lr, eL=latest_eval_loss, tL=latest_loss)
                    if save_every_n_iters > 0 and self.iter_idx % save_every_n_iters == 0:
                        self.save()
                    if self.iter_idx >= num_iters:
                        break
                if not saw_batch:
                    raise ValueError("training loader yielded no batches")
        finally:
            iter_pbar.close()
        self.flush_writer()
        return losses

    def eval(self, loader):
        """Evaluate on `loader` and log the result as "eval/loss".

        Leaves the model in eval mode.

        Returns:
            {"avg_L2": mean of the per-batch MSE losses}.

        Raises:
            ValueError: if `loader` yields no batches.
        """
        self.model.eval()
        total_loss = 0
        batch_count = 0
        batch_pbar = tqdm.tqdm_notebook(loader, desc="eval", leave=False)
        with torch.no_grad():
            for batch in batch_pbar:
                gt_H_vec = torch.stack(tuple(entry["H_vec"] for entry in batch)).to(self.device)
                pred_H_vec = self.model(batch)
                loss = self.criterion(pred_H_vec, gt_H_vec)
                total_loss += loss.item()
                batch_count += 1
                batch_pbar.set_postfix(l=total_loss/batch_count)
        if batch_count == 0:
            raise ValueError("evaluation loader yielded no batches")
        latest_loss = total_loss/batch_count
        stats = {
            "avg_L2": latest_loss,
        }
        self.writer.add_scalar('eval/loss', latest_loss, self.iter_idx)
        return stats

# Fresh, timestamped log directory for this training session; any existing directory with
# the same name is removed first.
log_dir = os.path.join("homography_regressor_models", datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
shutil.rmtree(log_dir, ignore_errors=True)
os.makedirs(log_dir)
trainer = HomographyRegressorTrainer(model=model, log_dir=log_dir, lr=8e-5)

In [None]:
# Train the homography regressor, evaluating on the held-out half every 5000 iterations.
losses = trainer.train(train_loader, num_iters=50000, evalloader=test_loader, eval_every_n_iters=5000)
# TODO: debug why it's taking ~1/1.23 sec per iteration. Forward inference through backbone should be around 0.1 sec. Maybe batch loader? Maybe FCNet? Maybe some other inefficiency?

In [None]:
### ANALYZE: load dataset and track contiguous frames based on category and iou+dist proximity
# For every category, detections in consecutive entries are matched to the last box of each
# active track; matched detections extend the track, unmatched tracks are retired, and
# unmatched detections start new tracks.
# NOTE(review): assign_bbox_matches is not defined in any visible cell -- confirm its source.

ds = DetectionDataset.loadFromJson(os.path.join(SOURCE_DIR, "detection_dataset.json"))

tracks = [[] for _ in range(len(ds.thing_classes))]  # tracks[category_id] = list(list(entry_idx, anno_idx, bbox_xyxy), ...)
active_tracks = [[] for _ in range(len(ds.thing_classes))]
# Per-category minimum IoU for a match; categories not listed use 0.0.
iou_threshs = {ds.thing_classes.index(category): 0.1 for category in ("icon_plate_tuna", "icon_plate_shrimp")}

# Wrap the sequence (not the enumerate object) so tqdm knows the total and can show progress/ETA.
for entry_idx, entry in enumerate(tqdm.tqdm(ds.entries)):
    for category_id in range(len(ds.thing_classes)):
        cat_active_tracks = active_tracks[category_id]
        bboxes_det, anno_idxs = entry.extract_bboxes(category_id)
        # Last known box of every active track of this category.
        bboxes_trk = [track[-1][2] for track in cat_active_tracks]
        if len(bboxes_trk) > 0:
            bboxes_trk = np.stack(bboxes_trk, axis=0)

        min_iou = iou_threshs.get(category_id, 0.0)
        matched_det_trk_idx_pairs, unmatched_det_idxs, unmatched_trk_idxs = assign_bbox_matches(bboxes_det, bboxes_trk, min_iou=min_iou)
        for det_idx, trk_idx in matched_det_trk_idx_pairs:
            cat_active_tracks[trk_idx].append((entry_idx, anno_idxs[det_idx], bboxes_det[det_idx]))
        # Pop from the highest index down so the remaining indices stay valid.
        for trk_idx in sorted(unmatched_trk_idxs, reverse=True):
            cat_active_tracks.pop(trk_idx)
        for det_idx in unmatched_det_idxs:
            # The same list object is shared by the active list and the archive, so later
            # appends to an active track show up in `tracks` too.
            new_track = [(entry_idx, anno_idxs[det_idx], bboxes_det[det_idx])]
            cat_active_tracks.append(new_track)
            tracks[category_id].append(new_track)

print()
for cat_id, track in enumerate(tracks):
    print(f"category {cat_id} ({ds.thing_classes[cat_id]}): {len(track)}")

In [None]:
### EXPLORE: print and visualize temporal tracks for given category
# One horizontal segment per track, spanning its first to last entry index.

cat_id = 9
msg = f"For category {cat_id} ({ds.thing_classes[cat_id]})"
print(msg)
plt.figure()
for track_idx, track in enumerate(tracks[cat_id]):
    first_item, last_item = track[0], track[-1]
    start_entry_idx, end_entry_idx = first_item[0], last_item[0]
    print(f"track idx {track_idx}: {len(track)} (idx=[{start_entry_idx} ... {end_entry_idx}]")
    plt.plot((start_entry_idx, end_entry_idx), (track_idx, track_idx), "-ok")
plt.title(msg)
plt.show()

In [None]:
### VISUALIZE: for given category and track idx, draw frames of track
# Crops the tracked box out of every frame of the chosen track and shows the crops as tiles.

cat_id, track_idx = 9, 0
images = []  # list(tile_img, label, entry_id)
for entry_idx, anno_idx, bbox_xyxy in tqdm.tqdm(tracks[cat_id][track_idx]):
    entry = ds.entries[entry_idx]
    frame_bgr = cv2_imread(os.path.join(SOURCE_DIR, entry.file_name))
    img = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    # Round to the nearest pixel; the crop includes the max row/column.
    xmin, ymin, xmax, ymax = map(int, np.round(bbox_xyxy))
    tile = img[ymin:(ymax+1), xmin:(xmax+1)]
    images.append((tile, anno_idx, entry_idx))
plot_image_tiles(images, num_rows=5, num_cols=10, figsize=(28, 14))

In [None]:
### EXPLORE: test Kalman Filter's accuracy (xmid, ymid, area, aspect_ratio) between prediction and labels

# NOTE: first use cells above to load ds (pred or labels), compute all tracks, and choose category and track

cat_id, track_idx = 9, 0
n_sigma = 2

# Standard deviations passed to ConstantVelocityBboxKalmanTracker below.
# NOTE(review): presumably one value per state / measurement component -- confirm against the tracker.
process_stdevs = (1, 1, 25, 0.02, 32, 24, 100)
measurement_stdevs = (1, 1, 100, 0.02)

# A ground-truth box only counts as a match if its IoU with the tracked box exceeds this.
min_iou_gt = 0.6
ds_gt = DetectionDataset.loadFromJson(os.path.join(SOURCE_DIR, "detection_dataset.json"))

residuals = {k: [] for k in ("xmid", "ymid", "area", "ar", "entry_idx")}  # residual = pred - gt
# Per-step raw values kept alongside the residuals for later inspection.
debug = {k: [] for k in ("entry_idx", "pred_bbox_xyxy", "gt_bbox_xyxy", "pred_x", "pred_P_diag")}

def get_stats(anno):
    """Summarise a bounding-box annotation as centre, area and aspect ratio.

    Args:
        anno: object exposing ``left``, ``top``, ``right`` and ``bottom`` attributes.

    Returns:
        dict with keys ``"xmid"``, ``"ymid"``, ``"area"`` and ``"ar"`` (width / height).
        Width and height count both edge coordinates (``right - left + 1``).
    """
    left, right = anno.left, anno.right
    top, bottom = anno.top, anno.bottom

    # Inclusive extents: a box with left == right is one unit wide.
    box_w = float(right - left + 1)
    box_h = float(bottom - top + 1)

    stats = {}
    stats["xmid"] = (right + left) / 2.0
    stats["ymid"] = (bottom + top) / 2.0
    stats["area"] = box_w * box_h
    stats["ar"] = box_w / box_h
    return stats

# Replay the chosen track through a Kalman filter and compare every one-step-ahead prediction
# against the best-overlapping ground-truth box of the same category.
kf = None
for entry_idx, anno_idx, bbox_xyxy in tqdm.tqdm(tracks[cat_id][track_idx]):
    if kf is None:
        # The first detection only seeds the filter; there is no prediction to score yet.
        kf = ConstantVelocityBboxKalmanTracker(bbox_xyxy, process_stdevs=process_stdevs, measurement_stdevs=measurement_stdevs)
        continue

    # Predict first (this is what gets scored), then feed the detection of this frame.
    pred_bbox_xyxy, pred_x = kf.predict()
    pred_P = kf.P
    # if entry_idx % 2 == 0:
    kf.update(bbox_xyxy)

    # Find closest ground-truth annotation
    det_anno = ds.entries[entry_idx].annotations[anno_idx]  # loop-invariant for the search below
    best_anno_gt, best_iou = None, min_iou_gt
    for anno_gt in ds_gt.entries[entry_idx].annotations:
        if anno_gt.category_id != cat_id:
            continue
        iou = det_anno.iou(anno_gt)
        if iou > best_iou:
            best_iou = iou
            best_anno_gt = anno_gt
    # Fixed: this used to test `anno_gt` (the inner loop variable), so frames without a
    # sufficiently overlapping ground-truth box were not skipped and crashed in get_stats(None).
    if best_anno_gt is None:
        continue

    # Collect residual
    pred_stats = get_stats(BBoxAnnotation(*pred_bbox_xyxy, category_id=cat_id))
    gt_stats = get_stats(best_anno_gt)
    for k in residuals:
        if k == "entry_idx":
            residuals[k].append(entry_idx)  # hack
        else:
            residuals[k].append(pred_stats[k] - gt_stats[k])

    # Keep the raw values of this frame for the plots drawn after the loop.
    debug["entry_idx"].append(entry_idx)
    debug["pred_bbox_xyxy"].append(pred_bbox_xyxy.flatten())
    debug["gt_bbox_xyxy"].append(best_anno_gt.toDict()["bbox"])
    debug["pred_x"].append(pred_x)
    debug["pred_P_diag"].append(np.diag(pred_P))

# Print mean / standard deviation of every residual series (the entry index list is bookkeeping only).
for k, values in residuals.items():
    if k == "entry_idx":
        continue
    mean = np.mean(values)
    sigma = np.std(values)
    print(f"residuals (pred - obs) for {k}: u={mean:.3f} / sigma={sigma:.3f}")

# Stack the per-frame debug records into arrays: the first 4 columns are plotted as
# (xmid, ymid, area, aspect_ratio), the remaining ones as their velocities.
entry_idxs = residuals["entry_idx"]
gt_bbox_xyar = np.array([convert_bboxes_xyxy2xyar(np.array(bbox_xyxy)) for bbox_xyxy in debug["gt_bbox_xyxy"]])
pred_x = np.array(debug["pred_x"])
pred_P_diag = np.array(debug["pred_P_diag"])
pred_bbox_xyar, pred_v_xya = pred_x[:, :4], pred_x[:, 4:]
pred_std_bbox_xyar = np.sqrt(pred_P_diag[:, :4])
pred_std_v_xya = np.sqrt(pred_P_diag[:, 4:])

labels = ("xmid", "ymid", "area", "aspect_ratio", "vx", "vy", "v_area")
plt.figure(figsize=(24, 20))
for plt_idx, label in enumerate(labels):
    plt.subplot(4, 2, plt_idx + 1)
    is_bbox_component = plt_idx < 4

    # Pick the predicted series and its n-sigma band for this panel.
    if is_bbox_component:
        pred_ys = pred_bbox_xyar[:, plt_idx]
        pred_y_errors = n_sigma * pred_std_bbox_xyar[:, plt_idx]
    else:
        pred_ys = pred_v_xya[:, plt_idx - 4]
        pred_y_errors = n_sigma * pred_std_v_xya[:, plt_idx - 4]
    plt.fill_between(entry_idxs, pred_ys - pred_y_errors, pred_ys + pred_y_errors, color="plum")
    plt.plot(entry_idxs, pred_ys, "-", color="purple")
    if is_bbox_component:
        # Ground truth exists only for the box components, not for the velocities.
        plt.plot(entry_idxs, gt_bbox_xyar[:, plt_idx], "g-")

    # Zoom in vertically around the centre of the auto-scaled y range.
    axes = list(plt.axis())
    axis_y_mid = (axes[2] + axes[3]) / 2
    axis_y_height = (axes[3] - axes[2]) * (0.4 if is_bbox_component else 0.6)
    axes[2] = axis_y_mid - axis_y_height / 2
    axes[3] = axis_y_mid + axis_y_height / 2
    plt.axis(axes)

    plt.xlabel(label)
plt.show()


In [None]:
### VISUALIZE: load dataset (gt/pred) and visualize tracks based on category and iou+dist proximity

# ds = DetectionDataset.loadFromJson(os.path.join(SOURCE_DIR, "detection_dataset.pred_2.json"))
ds = DetectionDataset.loadFromJson(os.path.join(eval_cfg.OUTPUT_DIR, "detection_dataset.pred_2.json"))

# Kalman filter noise settings used for every newly created track.
kf_process_stdevs = (2, 2, 40, 0.02, 4, 3, 50)  # tuning for chefs
kf_measurement_stdevs = (1, 1, 25, 0.01)

# An unmatched track is kept alive on predictions alone while its consecutive-predict count is below this.
max_num_kf_predicts = 5
# Category ids that get a Kalman filter attached to their tracks.
kf_category_ids = {
    ds.thing_classes.index(cat)
    for cat in (
        "chef", "chef_carrying", "chef_chopping",
        "plate", "shrimp", "tuna",
        "cut_shrimp", "cut_tuna",
        "plate_cut_shrimp", "plate_cut_tuna",
        "icon_shrimp", "icon_tuna",
    )
}

TrackEntry = namedtuple("TrackEntry", "track_idx entry_idx anno_idx bbox_xyxy anno kf")
# Per-category containers: every track ever created, and the subset still being extended.
tracks = [[] for _ in ds.thing_classes]
active_tracks = [[] for _ in ds.thing_classes]
free_track_idx = 0  # next unused track id (shared across all categories)
# Per-category minimum IoU for detection/track matching; categories not listed use 0.0.
iou_threshs = dict.fromkeys(
    (ds.thing_classes.index(category) for category in ("icon_plate_tuna", "icon_plate_shrimp")),
    0.1,
)
vid_fps = 5.0

# Highest entry index whose detections have already been folded into the tracks.
last_tracked_entry_idx = -1


def _update_tracks(entry_idx, entry):
    """Associate the detections of `entry` with the active tracks, category by category.

    Matched tracks are extended with the detection (after a Kalman predict + update), unmatched
    tracks either coast on the Kalman prediction or are dropped from `active_tracks`, and every
    unmatched detection starts a new track. Mutates the notebook globals `tracks`,
    `active_tracks` and `free_track_idx`.
    """
    global free_track_idx

    # Iterate over all categories: update tracks
    for category_id in range(len(ds.thing_classes)):
        # TODO: remove bypass to only track kf categories (i.e. fg)
        if category_id not in kf_category_ids:
            continue

        use_kf = (category_id in kf_category_ids)
        cat_active_tracks = active_tracks[category_id]
        bboxes_det, anno_idxs = entry.extract_bboxes(category_id)
        # Tracks are matched on their most recent box (detected or Kalman-predicted).
        bboxes_trk = [track[-1].bbox_xyxy for track in cat_active_tracks]
        if len(bboxes_trk) > 0:
            bboxes_trk = np.stack(bboxes_trk, axis=0)

        min_iou = 0.0 if category_id not in iou_threshs else iou_threshs[category_id]
        matched_det_trk_idx_pairs, unmatched_det_idxs, unmatched_trk_idxs = assign_bbox_matches(bboxes_det, bboxes_trk, min_iou=min_iou)
        for det_idx, trk_idx in matched_det_trk_idx_pairs:
            kf = None
            if use_kf:
                kf = cat_active_tracks[trk_idx][-1].kf
                kf.predict()  # NOTE: still need to predict to update KF's state and cov estimates
                kf.update(bboxes_det[det_idx])
            anno_idx = anno_idxs[det_idx]
            cat_active_tracks[trk_idx].append(TrackEntry(
                cat_active_tracks[trk_idx][-1].track_idx,
                entry_idx,
                anno_idx,
                bboxes_det[det_idx],
                entry.annotations[anno_idx],
                kf)
            )
        # Descending order so that pop(trk_idx) does not shift indices still to be visited.
        for trk_idx in sorted(list(unmatched_trk_idxs), reverse=True):
            kf = None
            if use_kf:
                kf = cat_active_tracks[trk_idx][-1].kf
                if kf.num_conseq_predicts < max_num_kf_predicts:
                    # Coast: extend the track with the predicted box instead of a detection.
                    pred_bbox_xyxy, _ = kf.predict()
                    cat_active_tracks[trk_idx].append(TrackEntry(
                        cat_active_tracks[trk_idx][-1].track_idx,
                        entry_idx,
                        None,
                        pred_bbox_xyxy,
                        # The score is set to minus the number of consecutive predictions.
                        BBoxAnnotation(*pred_bbox_xyxy, category_id, score=-kf.num_conseq_predicts),
                        kf)
                    )
                    continue
                # else pop()
            cat_active_tracks.pop(trk_idx)
        for det_idx in unmatched_det_idxs:
            kf = ConstantVelocityBboxKalmanTracker(bboxes_det[det_idx], process_stdevs=kf_process_stdevs, measurement_stdevs=kf_measurement_stdevs) if use_kf else None
            anno_idx = anno_idxs[det_idx]
            new_track = [TrackEntry(
                free_track_idx,
                entry_idx,
                anno_idx,
                bboxes_det[det_idx],
                entry.annotations[anno_idx],
                kf),
            ]
            free_track_idx += 1
            cat_active_tracks.append(new_track)
            tracks[category_id].append(new_track)


# for entry_idx, entry in enumerate(tqdm.tqdm(ds.entries)):  # DEBUG: uncomment below section to debug fn
def make_frame(t):
    """Return the RGB frame for time `t` (seconds) with the currently active tracks drawn on it.

    The frame shows entry ``int(t * vid_fps)`` of `ds`. Tracks are advanced at most once per
    entry: moviepy's VideoClip evaluates frame 0 in its constructor (to learn the frame size)
    and again when writing the video, and without the guard below the detections of entry 0
    would be fed to the trackers twice. A request for an entry that is not newer than the last
    tracked one is therefore only re-rendered from the current track state.

    Args:
        t: time in seconds, as supplied by moviepy.

    Returns:
        The annotated image after a BGR -> RGB conversion.
    """
    global last_tracked_entry_idx
    entry_idx = int(t * vid_fps)
    entry = ds.entries[entry_idx]

    if entry_idx > last_tracked_entry_idx:
        _update_tracks(entry_idx, entry)
        last_tracked_entry_idx = entry_idx

    # Generate and return frame
    img = cv2_imread(os.path.join(SOURCE_DIR, entry.file_name))
    track_entries = []
    for cat_active_tracks in active_tracks:
        for track in cat_active_tracks:
            track_entries.append(track[-1])
    # Deterministic drawing order, independent of category / track creation order.
    track_entries.sort(key=lambda e: (e.anno.top, e.anno.bottom, e.anno.left, e.anno.right))
    annotations = []
    frame_track_idxs = []
    for e in track_entries:
        annotations.append(e.anno)
        frame_track_idxs.append(e.track_idx)
    img_annos = plot_track_annotations(img, annotations, frame_track_idxs, plot_labels)
    return cv2.cvtColor(img_annos, cv2.COLOR_BGR2RGB)

# One frame per dataset entry; the tiny epsilon keeps the last requested time strictly
# inside the clip so that int(t * vid_fps) never reaches len(ds.entries).
clip_duration = len(ds.entries) / vid_fps - 1e-10
clip = mpy.VideoClip(make_frame, duration=clip_duration)
clip.write_videofile("KF_pred.mp4", fps=vid_fps)

# Number of tracks created per category while rendering the video.
for cat_id, track in enumerate(tracks):
    class_name = ds.thing_classes[cat_id]
    print(f"category {cat_id} ({class_name}): {len(track)}")

In [None]:
if IN_COLAB:
    # Imported inside the guard: the google.colab package is only used when running in Colab.
    from google.colab import files
    # Offer the rendered tracking video ("KF_pred.mp4") for download.
    files.download("KF_pred.mp4")

In [None]:
### VISUALIZATION: Create video of homography from labelled bboxes vs imgproc-solved homography

consensus_max_tile_pxs = 125
vid_fps = 5.0

ds = DetectionDataset.loadFromJson(os.path.join(SOURCE_DIR, "detection_dataset.pred_2.json"))
tile_coords = get_tile_coords(tile_labels, tuple(ds.thing_classes))
num_grid_rows, num_grid_cols = len(tile_labels), len(tile_labels[0])

# grid: map grid_px = (tile_idx*num_pixels_per_tile+1:num_pixels_per_tile) to tile_idx
# more technically: map num_pixels_per_tile/2 to 0, num_pixels_per_tile + num_pixels_per_tile/2 to 1, ...
grid_coords = tile_coords.copy()
grid_coords[:, :2] = grid_coords[:, :2] * num_pixels_per_tile + num_pixels_per_tile / 2

# End points of the grid lines as homogeneous (x, y, 1) rows; every consecutive pair of rows
# is one segment: first the horizontal lines (one per row boundary), then the vertical ones.
grid_width_px = num_grid_cols * num_pixels_per_tile
grid_height_px = num_grid_rows * num_pixels_per_tile
grid_hxys_segments = []
for row_idx in range(num_grid_rows + 1):
    row_px = row_idx * num_pixels_per_tile
    grid_hxys_segments += [(0, row_px, 1.0), (grid_width_px, row_px, 1.0)]
for col_idx in range(num_grid_cols + 1):
    col_px = col_idx * num_pixels_per_tile
    grid_hxys_segments += [(col_px, 0, 1.0), (col_px, grid_height_px, 1.0)]
grid_hxys_segments = np.array(grid_hxys_segments)

def make_frame(t, label_margin_px=2, bbox_thickness=2, font_face=cv2.FONT_HERSHEY_SIMPLEX, font_scale=0.5, font_thickness=2):
    """Render the frame at time `t` with both homography grids and the matched tile labels.

    For entry ``int(t * vid_fps)`` of `ds`, a homography is solved from the detection centres
    (`compute_tile_homography`) and drawn in magenta on top of the grid obtained from the
    inverse of the entry's stored `H_grid_img` (green). Every detection in the consensus match
    gets an outlined box and a centred "<label>(row,col)" tag, and a title is placed at the top left.

    Args:
        t: time in seconds, as supplied by moviepy.
        label_margin_px: padding between a text and the edge of its white background box.
        bbox_thickness: only its half (integer division) is used, as an offset for text boxes.
        font_face, font_scale, font_thickness: forwarded to cv2.getTextSize / cv2.putText.

    Returns:
        The annotated image (channels swapped once right after loading).
    """
    white, black = (255, 255, 255), (0, 0, 0)
    img_idx = int(t * vid_fps)
    bbox_half_thickness = bbox_thickness // 2

    def text_size(text):
        # (width, height above baseline, baseline) of `text` in the configured font.
        (width, height_wo_bl), baseline = cv2.getTextSize(text, font_face, font_scale, font_thickness)
        return width, height_wo_bl, baseline

    def text_layout(anchor_px, width, height_wo_bl, baseline):
        # Background-box corners and text origin for a text anchored at `anchor_px`.
        ax, ay = anchor_px
        box_tl = (ax - bbox_half_thickness, ay - bbox_half_thickness)
        box_br = (ax + width + 2*label_margin_px - bbox_half_thickness,
                  ay + height_wo_bl + baseline + 2*label_margin_px - bbox_half_thickness)
        text_px = (ax + label_margin_px - bbox_half_thickness,
                   ay + height_wo_bl + label_margin_px - bbox_half_thickness)
        return box_tl, box_br, text_px

    def draw_segments(hxys, color, thickness):
        # Rows (2k, 2k+1) of `hxys` are the two end points of segment k.
        for idx in range(0, len(hxys), 2):
            start_px = (int(hxys[idx, 0]), int(hxys[idx, 1]))
            end_px = (int(hxys[idx+1, 0]), int(hxys[idx+1, 1]))
            cv2.line(img_frame, start_px, end_px, color, thickness, cv2.LINE_AA)

    entry = ds.entries[img_idx]
    det_coords = np.array([
        ((anno.left + anno.right)/2, (anno.top + anno.bottom)/2, anno.category_id)
        for anno in entry.annotations
    ])
    H_img_grid, consensus_matched_det_grid_idxs, _, _ = compute_tile_homography(det_coords, grid_coords, consensus_max_tile_pxs)
    H_img_grid_imgproc = np.linalg.inv(np.array(entry.H_grid_img).reshape((3, 3)))
    img_hxys_segments = apply_homography(H_img_grid, grid_hxys_segments)
    img_hxys_segments_imgproc = apply_homography(H_img_grid_imgproc, grid_hxys_segments)
    # Swap the first and last colour channels of the loaded image.
    # NOTE(review): cv2.imread yields BGR, so the result is RGB even though the constant is named RGB2BGR.
    img_frame = cv2.cvtColor(cv2.imread(os.path.join(SOURCE_DIR, entry.file_name)), cv2.COLOR_RGB2BGR)

    # Thick green grid first, thin magenta grid on top, so both stay visible where they coincide.
    draw_segments(img_hxys_segments_imgproc, (0, 255, 0), 5)
    draw_segments(img_hxys_segments, (255, 0, 255), 2)

    # Pass 1: outlined boxes (white on black) for every matched detection.
    for det_idx, _ in consensus_matched_det_grid_idxs:
        anno = entry.annotations[det_idx]
        top_left = (int(anno.left), int(anno.top))
        bottom_right = (int(anno.right), int(anno.bottom))
        cv2.rectangle(img_frame, top_left, bottom_right, black, 5, cv2.LINE_AA)
        cv2.rectangle(img_frame, top_left, bottom_right, white, 2, cv2.LINE_AA)

    # Pass 2: white label backgrounds; remember where each text goes so that all texts
    # are drawn after all backgrounds (pass 3) and are never covered by a neighbour's box.
    pending_texts = []
    for det_idx, grid_idx in consensus_matched_det_grid_idxs:
        grid_col_idx, grid_row_idx = (int(idx) for idx in tile_coords[grid_idx, :2])
        category = plot_labels[ds.thing_classes.index(tile_labels[grid_row_idx][grid_col_idx])]
        anno = entry.annotations[det_idx]
        xmid = (int(anno.left) + int(anno.right)) // 2
        ymid = (int(anno.top) + int(anno.bottom)) // 2

        coord_str = f"{category}({grid_row_idx},{grid_col_idx})"
        coord_width, coord_height_wo_bl, coord_baseline = text_size(coord_str)
        coord_height = coord_height_wo_bl + coord_baseline
        # Centre the tag on the detection.
        anchor_px = (int(xmid - coord_width/2.0), int(ymid - coord_height/2.0))
        coord_tl_px, coord_br_px, coord_text_px = text_layout(anchor_px, coord_width, coord_height_wo_bl, coord_baseline)
        cv2.rectangle(img_frame, coord_tl_px, coord_br_px, white, cv2.FILLED)
        pending_texts.append((coord_str, coord_text_px))

    # Pass 3: label texts.
    for coord_str, coord_text_px in pending_texts:
        cv2.putText(img_frame, coord_str, coord_text_px, font_face, font_scale, black, font_thickness, cv2.LINE_AA)

    # Title in the top-left corner.
    title_str = f"{img_idx}: {entry.file_name} (green=H_imgproc, purple=H_autocorr)"
    title_tl_px, title_br_px, title_text_px = text_layout((20, 20), *text_size(title_str))
    cv2.rectangle(img_frame, title_tl_px, title_br_px, white, cv2.FILLED)
    cv2.putText(img_frame, title_str, title_text_px, font_face, font_scale, black, font_thickness, cv2.LINE_AA)

    return img_frame

# One frame per dataset entry; the tiny epsilon keeps the last requested time strictly
# inside the clip so that int(t * vid_fps) never reaches len(ds.entries).
clip_duration = len(ds.entries) / vid_fps - 1e-10
clip = mpy.VideoClip(make_frame, duration=clip_duration)
clip.write_videofile("H_autocorr.mp4", fps=vid_fps)

# Quick single-frame check (left disabled):
# plt.figure(figsize=(20, 15))
# plt.imshow(make_frame(0))

In [None]:
if IN_COLAB:
    # Imported inside the guard: the google.colab package is only used when running in Colab.
    from google.colab import files
    # Offer the rendered homography comparison video ("H_autocorr.mp4") for download.
    files.download("H_autocorr.mp4")