### Define model and imports

In [None]:
# Imports
import os
import time
import numpy as np
from loguru import logger
import cv2
import matplotlib.pyplot as plt
from typing import Tuple, List

import torch
import torchvision
import albumentations as A
from albumentations.pytorch import ToTensorV2
import imageio
from tqdm import tqdm
from lit_yolox import LitYOLOX

def write2mp4(path, frames, fps=10):
    """Write a sequence of frames to a video file using imageio.

    Args:
        path: Destination file path handed to ``imageio.get_writer``.
        frames: Iterable of frames; each one is passed to ``writer.append_data``.
        fps: Frames per second of the written video.
    """
    writer = imageio.get_writer(path, fps=fps)
    try:
        for frame in frames:
            writer.append_data(frame)
    finally:
        # Always release the writer, even if a frame is rejected part-way through,
        # so the file handle/encoder is not leaked.
        writer.close()

In [None]:
# Test images. Every file is read as a 3-channel BGR image (cv2.IMREAD_COLOR == 1,
# which is also cv2.imread's default flag). cv2.imread returns None when a file
# cannot be read, so check the selected image before using it.

# Images that simulate a ground vehicle view
image1 = cv2.imread("/home/bbikdash/Development/1_object_detection/eotacs_demo_april_2023_car_detection/test_data/downward_vio_bin/image_raw/1332343552747.png", cv2.IMREAD_COLOR)
image2 = cv2.imread("/home/bbikdash/Development/1_object_detection/eotacs_demo_april_2023_car_detection/test_data/downward_vio_bin/image_raw/895651153410.png", cv2.IMREAD_COLOR)
image3 = cv2.imread("/home/bbikdash/Development/1_object_detection/eotacs_demo_april_2023_car_detection/test_data/downward_vio_bin/image_raw/887585870861.png", cv2.IMREAD_COLOR)
image4 = cv2.imread("/home/bbikdash/Development/1_object_detection/eotacs_demo_april_2023_car_detection/test_data/downward_vio_bin/image_raw/1097816758502.png", cv2.IMREAD_COLOR)
image5 = cv2.imread("/home/bbikdash/Development/1_object_detection/eotacs_demo_april_2023_car_detection/test_data/downward_vio_bin/image_raw/1219329453064.png", cv2.IMREAD_COLOR)

# KEF Vesper vehicle evaluation demo frames
image6 = cv2.imread("/mnt/data/Datasets/KEF_Vesper_Vehicle_Evaluation_Datasets/demo/187719784054.png", cv2.IMREAD_COLOR)
image7 = cv2.imread("/mnt/data/Datasets/KEF_Vesper_Vehicle_Evaluation_Datasets/demo/190795563947.png", cv2.IMREAD_COLOR)
image8 = cv2.imread("/mnt/data/Datasets/KEF_Vesper_Vehicle_Evaluation_Datasets/demo/236421139624.png", cv2.IMREAD_COLOR)
image9 = cv2.imread("/mnt/data/Datasets/KEF_Vesper_Vehicle_Evaluation_Datasets/demo/187853094730.png", cv2.IMREAD_COLOR)

# UAV vehicle detection dataset
# image10 = cv2.imread("/mnt/data/Datasets/UAV-Vehicle-Detection-Dataset/data/train/DJI-00760-00022.jpg", 1)
image11 = cv2.imread("/mnt/data/Datasets/UAV-Vehicle-Detection-Dataset/data/val/MOS86.png", cv2.IMREAD_COLOR)

# Building detection dataset (EO imagery)
image12 = cv2.imread("/mnt/data/Datasets/Building_Detection/eo/ETG/11_17_08/11_17_08_img95.png", cv2.IMREAD_COLOR)
image13 = cv2.imread("/mnt/data/Datasets/Building_Detection/eo/ETG/12_45_35/12_45_35_img157.png", cv2.IMREAD_COLOR)

### Load Lightning Weights

In [None]:
# Prefer the first CUDA device but fall back to the CPU so the cell also runs on
# machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# map_location lets a checkpoint that was saved on a GPU be restored on the chosen device.
model = LitYOLOX.load_from_checkpoint(
    "./logs/yolox_s_eo_building/inria_pretrain/version_4/checkpoints/epoch=85-step=26230.ckpt",
    map_location=device,
)
model.to(device)
logger.info(model.device)

val_input_size = [640, 640]
transforms = A.Compose([
    # Geometric transformations
    A.Resize(val_input_size[0], val_input_size[1], always_apply=True),
    # A.SmallestMaxSize(val_input_size[0], always_apply=True),
    # A.CenterCrop(val_input_size[0], val_input_size[1], always_apply=True),
    ToTensorV2(),
])
image = image13
if image is None:
    # cv2.imread signals an unreadable/missing file by returning None instead of raising.
    raise FileNotFoundError("Selected test image could not be loaded; check the path passed to cv2.imread")

# Positional arguments are (image, 0.25, 0.5, transforms); the two floats are
# presumably the confidence and NMS thresholds -- confirm against LitYOLOX.inference.
pred = model.inference(image, 0.25, 0.5, transforms)

# Visualize the original image next to the image returned by model.inference
figure, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10), frameon=True, layout="tight", dpi=300)
ax[0].imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
ax[0].set_title("Original Image")
ax[0].set_axis_off()
ax[1].imshow(cv2.cvtColor(pred, cv2.COLOR_BGR2RGB))
ax[1].set_title("Bounding Box Prediction")
ax[1].set_axis_off()

# Save image as an asset for README
# Bounds of the plotting function
# h,w,c = image.shape
# horizontal_bounds = [-w//2, w//2]
# vertical_bounds = [-h//2, h//2]
# figure, ax = plt.subplots(figsize=(10,10), frameon=False, layout='tight')
# ax.imshow(cv2.cvtColor(pred, cv2.COLOR_BGR2RGB), extent=[horizontal_bounds[0], horizontal_bounds[1], vertical_bounds[0], vertical_bounds[1]])
# ax.set_xlim(horizontal_bounds[0], horizontal_bounds[1])
# ax.set_ylim(vertical_bounds[0], vertical_bounds[1])
# ax.set_axis_off()
# plt.gca()
# plt.subplots_adjust(top = 1, bottom = 0, right = 1, left = 0, 
#             hspace = 0, wspace = 0)
# plt.margins(0,0)
# plt.savefig("./car_detections_asset.png", bbox_inches='tight', pad_inches=0) # After years: https://stackoverflow.com/questions/11837979/removing-white-space-around-a-saved-image

### YOLOX ONNX Inference

In [None]:
# Pre-processing parameters for the ONNX pipeline.
# Per-channel mean expressed on the 0-255 pixel scale, and the per-channel std.
mean = 255.0 * np.array([0.485, 0.456, 0.406])
std = [0.229, 0.224, 0.225]
# Multiplying a 0-255 pixel value by this factor maps it into [0, 1].
scale_factor = 1 / 255.0

# Network input resolution and the matching (w, h, w, h) box scaling row.
height = 640
width = 640
scale = np.array([[width, height] * 2])

# Class metadata: a single 'vehicle' class, with one RGB colour entry per slot.
CLASS_LABELS = ['vehicle']
CLASS_IND = np.arange(1)
class_colors = [(1.0, 0.0, 0.0)] * 5

# num_classes = 1
# confidence_thresh = 0.15
# overlap_thresh = 0.1
# iou_thresh = 0.75
# nms_threshold = 0.5
GRID = None
def _pre_process(image: np.ndarray) -> np.ndarray:
    """Convert an OpenCV image into the input blob fed to the ONNX network.

    The image is resized to the module-level ``width`` x ``height`` and packed
    by ``cv2.dnn.blobFromImage``. No mean subtraction, value scaling, std
    division or channel swap is applied, so pixel values keep their original
    range and the channel order stays as loaded (BGR for ``cv2.imread``).

    Args:
        image: HxWx3 image as returned by ``cv2.imread``.

    Returns:
        The blob produced by ``cv2.dnn.blobFromImage`` (batch of one image).
    """
    return cv2.dnn.blobFromImage(
        image=image,
        # OpenCV sizes are (width, height); the original passed them reversed,
        # which only went unnoticed because the input is square.
        size=(width, height),
        # False keeps the channel order unchanged (no BGR <-> RGB swap).
        swapRB=False,
    )

# (major, minor) of the installed torch, used to pick the right meshgrid call.
_TORCH_VER = [int(part) for part in torch.__version__.split(".")[:2]]


def meshgrid(*tensors):
    """Version-tolerant wrapper around ``torch.meshgrid``.

    From torch 1.10 onward the ``indexing="ij"`` argument is passed explicitly;
    older releases are called without it.
    """
    if _TORCH_VER < [1, 10]:
        return torch.meshgrid(*tensors)
    return torch.meshgrid(*tensors, indexing="ij")
    
def decode_outputs(outputs, dtype, input_size=(640, 640), arch_strides=(8, 16, 32)):
    """Decode raw YOLOX head outputs into boxes in network-input pixel units.

    For every stride a grid of cell coordinates is built; the predicted centre
    offsets are added to the cell coordinates and multiplied by the stride, and
    the predicted sizes are exponentiated and multiplied by the stride. All
    remaining channels (index 4 onward) are passed through untouched.

    Args:
        outputs: Tensor whose last dimension holds (x, y, w, h, ...) and whose
            second-to-last dimension enumerates the cells of all feature maps,
            ordered stride by stride and row-major within each map.
        dtype: Type passed to ``Tensor.type`` for the grid and stride tensors.
        input_size: (height, width) of the network input. Each feature map has
            ``input_size // stride`` cells per side; the default reproduces the
            80x80, 40x40 and 20x20 maps of a 640x640 input.
        arch_strides: Stride of each output level (see the YOLOX architectures).

    Returns:
        Tensor of the same shape as ``outputs`` with decoded (cx, cy, w, h).

    Raises:
        ValueError: If ``outputs`` does not contain one row per grid cell.
    """
    in_h, in_w = input_size

    grids = []
    strides = []
    for stride in arch_strides:
        hsize, wsize = in_h // stride, in_w // stride
        # Row-major enumeration of the cells as (x, y) pairs; equivalent to an
        # "ij" meshgrid stacked as (xv, yv) and flattened.
        xs = torch.arange(wsize).repeat(hsize)
        ys = torch.arange(hsize).repeat_interleave(wsize)
        grid = torch.stack((xs, ys), dim=1).view(1, -1, 2)
        grids.append(grid)
        strides.append(torch.full((1, grid.shape[1], 1), stride))

    # Keep the helper tensors on the same device as the predictions.
    grids = torch.cat(grids, dim=1).type(dtype).to(outputs.device)
    strides = torch.cat(strides, dim=1).type(dtype).to(outputs.device)

    if outputs.shape[-2] != grids.shape[1]:
        raise ValueError(
            f"outputs has {outputs.shape[-2]} predictions but input_size={tuple(input_size)} "
            f"with strides {tuple(arch_strides)} yields {grids.shape[1]} grid cells"
        )

    return torch.cat([
        (outputs[..., 0:2] + grids) * strides,
        torch.exp(outputs[..., 2:4]) * strides,
        outputs[..., 4:]
    ], dim=-1)


def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45, class_agnostic=False):
    """Filter decoded predictions by confidence and apply non-maximum suppression.

    Args:
        prediction: Tensor indexed as (image, candidate, channel) whose channels
            are (cx, cy, w, h, obj_conf, class scores...). It is not modified.
        num_classes: Number of class-score channels following ``obj_conf``.
        conf_thre: Minimum ``obj_conf * class_conf`` for a candidate to be kept.
        nms_thre: IoU threshold handed to torchvision's NMS.
        class_agnostic: If True, run a single NMS over all classes; otherwise
            NMS is performed per predicted class.

    Returns:
        A list with one entry per image: ``None`` when nothing survives, else a
        tensor whose rows are (x1, y1, x2, y2, obj_conf, class_conf, class_pred).
    """
    # Convert centre/size boxes to corner format in a new tensor so the
    # caller's tensor is left untouched.
    centers = prediction[..., 0:2]
    half_sizes = prediction[..., 2:4] / 2
    corner_pred = torch.cat(
        (centers - half_sizes, centers + half_sizes, prediction[..., 4:]), dim=-1
    )

    output = [None] * len(corner_pred)
    for i, image_pred in enumerate(corner_pred):
        # No candidates for this image => process next image
        if not image_pred.size(0):
            continue

        # Get score and class with highest confidence
        class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)

        # squeeze(1) drops only the keepdim axis, so the mask stays 1-D even
        # when the image has a single candidate row.
        conf_mask = image_pred[:, 4] * class_conf.squeeze(1) >= conf_thre

        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
        detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
        detections = detections[conf_mask]
        if not detections.size(0):
            continue

        boxes = detections[:, :4]
        scores = detections[:, 4] * detections[:, 5]
        if class_agnostic:
            nms_out_index = torchvision.ops.nms(boxes, scores, nms_thre)
        else:
            nms_out_index = torchvision.ops.batched_nms(boxes, scores, detections[:, 6], nms_thre)

        output[i] = detections[nms_out_index]

    return output

    
def vis(img, boxes, scores, cls_ids, conf=0.5, class_names: List[str] = None):
    """Draw labelled bounding boxes on ``img`` in place and return it.

    Args:
        img: Image array drawn on with OpenCV (modified in place).
        boxes: Sequence of (x0, y0, x1, y1) boxes in pixel coordinates.
        scores: Per-box scores; boxes scoring below ``conf`` are skipped.
        cls_ids: Per-box class indices, used to pick a colour and a label.
        conf: Minimum score for a box to be drawn.
        class_names: Optional class labels indexed by class id. When omitted,
            or when it has no entry for an id, the numeric id is used as label.

    Returns:
        The same ``img`` object that was passed in.
    """
    for box, score, raw_cls_id in zip(boxes, scores, cls_ids):
        cls_id = int(raw_cls_id)
        if score < conf:
            continue
        x0, y0, x1, y1 = (int(coord) for coord in box[:4])

        # Wrap around the palette so ids beyond its length still get a colour.
        base_color = _COLORS[cls_id % len(_COLORS)]
        color = (base_color * 255).astype(np.uint8).tolist()

        if class_names is not None and 0 <= cls_id < len(class_names):
            label = class_names[cls_id]
        else:
            label = str(cls_id)
        text = '{}:{:.1f}%'.format(label, score * 100)

        # Dark text on light colours, light text on dark colours.
        txt_color = (0, 0, 0) if np.mean(base_color) > 0.5 else (255, 255, 255)
        font = cv2.FONT_HERSHEY_SIMPLEX

        txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
        cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)

        # Filled, slightly darker rectangle behind the label text.
        txt_bk_color = (base_color * 255 * 0.7).astype(np.uint8).tolist()
        cv2.rectangle(
            img,
            (x0, y0 + 1),
            (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])),
            txt_bk_color,
            -1
        )
        cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1)

    return img

# Colour palette used by vis(): a flat list of values in [0, 1] reshaped into one
# three-component row per class id (80 rows), stored as float32. vis() multiplies
# a row by 255 to obtain the OpenCV drawing colour.
_COLORS = np.array(
    [
        0.000, 0.447, 0.741,
        0.850, 0.325, 0.098,
        0.929, 0.694, 0.125,
        0.494, 0.184, 0.556,
        0.466, 0.674, 0.188,
        0.301, 0.745, 0.933,
        0.635, 0.078, 0.184,
        0.300, 0.300, 0.300,
        0.600, 0.600, 0.600,
        1.000, 0.000, 0.000,
        1.000, 0.500, 0.000,
        0.749, 0.749, 0.000,
        0.000, 1.000, 0.000,
        0.000, 0.000, 1.000,
        0.667, 0.000, 1.000,
        0.333, 0.333, 0.000,
        0.333, 0.667, 0.000,
        0.333, 1.000, 0.000,
        0.667, 0.333, 0.000,
        0.667, 0.667, 0.000,
        0.667, 1.000, 0.000,
        1.000, 0.333, 0.000,
        1.000, 0.667, 0.000,
        1.000, 1.000, 0.000,
        0.000, 0.333, 0.500,
        0.000, 0.667, 0.500,
        0.000, 1.000, 0.500,
        0.333, 0.000, 0.500,
        0.333, 0.333, 0.500,
        0.333, 0.667, 0.500,
        0.333, 1.000, 0.500,
        0.667, 0.000, 0.500,
        0.667, 0.333, 0.500,
        0.667, 0.667, 0.500,
        0.667, 1.000, 0.500,
        1.000, 0.000, 0.500,
        1.000, 0.333, 0.500,
        1.000, 0.667, 0.500,
        1.000, 1.000, 0.500,
        0.000, 0.333, 1.000,
        0.000, 0.667, 1.000,
        0.000, 1.000, 1.000,
        0.333, 0.000, 1.000,
        0.333, 0.333, 1.000,
        0.333, 0.667, 1.000,
        0.333, 1.000, 1.000,
        0.667, 0.000, 1.000,
        0.667, 0.333, 1.000,
        0.667, 0.667, 1.000,
        0.667, 1.000, 1.000,
        1.000, 0.000, 1.000,
        1.000, 0.333, 1.000,
        1.000, 0.667, 1.000,
        0.333, 0.000, 0.000,
        0.500, 0.000, 0.000,
        0.667, 0.000, 0.000,
        0.833, 0.000, 0.000,
        1.000, 0.000, 0.000,
        0.000, 0.167, 0.000,
        0.000, 0.333, 0.000,
        0.000, 0.500, 0.000,
        0.000, 0.667, 0.000,
        0.000, 0.833, 0.000,
        0.000, 1.000, 0.000,
        0.000, 0.000, 0.167,
        0.000, 0.000, 0.333,
        0.000, 0.000, 0.500,
        0.000, 0.000, 0.667,
        0.000, 0.000, 0.833,
        0.000, 0.000, 1.000,
        0.000, 0.000, 0.000,
        0.143, 0.143, 0.143,
        0.286, 0.286, 0.286,
        0.429, 0.429, 0.429,
        0.571, 0.571, 0.571,
        0.714, 0.714, 0.714,
        0.857, 0.857, 0.857,
        0.000, 0.447, 0.741,
        0.314, 0.717, 0.741,
        0.50, 0.5, 0
    ]
).astype(np.float32).reshape(-1, 3)

In [None]:
# Load the ONNX detection model with OpenCV's DNN module
model_path = "/home/bbikdash/Development/1_object_detection/kef_yolox/weights/yolox_s_eo_building.onnx"
opencv_net = cv2.dnn.readNetFromONNX(model_path)

image = image13
if image is None:
    # cv2.imread signals an unreadable/missing file by returning None instead of raising.
    raise FileNotFoundError("Selected test image could not be loaded; check the path passed to cv2.imread")
logger.info(f"Image Shape: {image.shape}")

# Measure pre-processing time (logging is kept outside the timed region)
start_preproc = time.time()
network_input = _pre_process(image)
end_preproc = time.time()
logger.info(f"Network Input Shape: {network_input.shape}")

# Measure inference time: forward pass, box decoding and NMS
start_inference = time.time()

# set OpenCV DNN input
opencv_net.setInput(network_input)
# OpenCV DNN inference
output = opencv_net.forward()
logger.info(f"Network Output Shape: {output.shape}")

# Store the result under its own name: rebinding `decode_outputs` would replace
# the function with a tensor and make this cell fail when it is run again.
decoded_outputs = decode_outputs(torch.from_numpy(output), dtype=torch.float)
detections = postprocess(decoded_outputs, num_classes=1, conf_thre=0.25, class_agnostic=True)[0]
end_inference = time.time()


start_draw_time = time.time()
# Channel-last copies of the network input: one left clean, one to draw on.
background = network_input[0].transpose(1, 2, 0).copy()
visualization_img = background.copy()

# postprocess returns None for an image without surviving detections.
if detections is not None:
    detections = detections.detach().cpu().numpy()
    bboxes = detections[:, 0:4]
    cls = detections[:, 6]
    scores = detections[:, 4] * detections[:, 5]
    print(bboxes)
    # Boxes are drawn with colours in the [0, 255] range
    visualization_img = vis(visualization_img, bboxes, scores, cls, 0.1, ['vehicle'])
end_draw_time = time.time()


def _min_max_normalize(img):
    """Rescale ``img`` to [0, 1] for display; a constant image maps to all zeros."""
    lo, hi = img.min(), img.max()
    if hi == lo:
        # Avoid dividing by zero when every pixel has the same value.
        return np.zeros_like(img)
    return (img - lo) / (hi - lo)


# Normalize images for visualization
visualization_img = _min_max_normalize(visualization_img)
background = _min_max_normalize(background)

# print("OpenCV DNN prediction: \n")
# print("\toutput: ", output.shape)
# print("\tbboxes: ", bboxes.shape)
# print("\tscores: ", scores.shape)
# print("\tclass_indices: ", class_indices.shape)

# Visualize the network input next to the image with the drawn detections
figure, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10), frameon=True, layout="tight", dpi=300)
ax[0].imshow(cv2.cvtColor(background, cv2.COLOR_BGR2RGB))
ax[0].set_title("Network Input")
ax[0].set_axis_off()
ax[1].imshow(cv2.cvtColor(visualization_img, cv2.COLOR_BGR2RGB))
ax[1].set_title("Prediction")
ax[1].set_axis_off()
# ax[2].imshow(mask_proc) ; ax[2].set_title("With Preprocessing") ; ax[2].set_axis_off()

print(f"Pre-Processing time: {end_preproc-start_preproc}\n"
      f"Inference time: {end_inference-start_inference}\n"
      # f"Post-Processing time: {end_postproc-start_postproc}\n"
      f"Drawing time: {end_draw_time-start_draw_time}\n"
      f"\tPipeline Time: {end_draw_time-start_preproc}\n")
      # f"{with_post_proc.shape}, {np.max(with_post_proc)}, {np.min(with_post_proc)}")