# Image encoding benchmark

Visualize pretrainer results on a given image and measure the encoding time.

First, load the FP32 model:

In [1]:
import sys
sys.path.append("../")
from multitudinous.utils.model_builder import build_img_backbone
import torch

BACKBONE_VARIANT = "resnet50"
BATCH_SIZE = 4

# build the FP32 model
model_fp32 = build_img_backbone(BACKBONE_VARIANT, in_channels=4)
model_fp32.eval()

ResNet50(
  (conv1): Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BottleneckBlock(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, 

Now, quantize the model:

In [2]:
from multitudinous.utils.quantization import get_quantized_model

model_q = get_quantized_model(model_fp32, 'x86')
model_q.load_state_dict(torch.load("../resnet_q/resnet50_quantized.pth"))
model_q.eval()

  device=storage.device,


ResNet50(
  (conv1): QuantizedConv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), scale=0.05918791890144348, zero_point=64, padding=(3, 3), bias=False)
  (bn1): QuantizedBatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BottleneckBlock(
      (conv1): QuantizedConv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), scale=0.16583716869354248, zero_point=60, bias=False)
      (bn1): QuantizedBatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.47362759709358215, zero_point=63, padding=(1, 1), bias=False)
      (bn2): QuantizedBatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): QuantizedConv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), scale=0.1362021416425705, zero_point=74, bias=False)
      (bn3)

Now, create the dataset and dataloader instances:

In [3]:
from multitudinous.utils.dataset_builder import build_img_dataset

# build the dataset
dataset = build_img_dataset("tum_rgbd", "../../data/tum_rgbd1")
# create the dataloader
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

Now, run inference on a given image:

In [4]:
import time

NUM_BENCHMARKING_SAMPLES=100
NUM_WARMUP_SAMPLES=10

# metrics
total_imgs = len(dataset)
total_elapsed_time = 0
curr_sample = 0
avg_encoding_time = 0

for rgb, depth in dataloader:

    depth = depth.unsqueeze(1)
    rgbd = torch.cat((rgb, depth), dim=1)

    # run inference
    with torch.no_grad():
        start = time.time()
        output = model_q(rgbd)
        end = time.time()
        encoding_time = (end - start) * 1000

    curr_sample += BATCH_SIZE

    if curr_sample > NUM_WARMUP_SAMPLES:

        total_elapsed_time += encoding_time

        avg_encoding_time = total_elapsed_time/(curr_sample-NUM_WARMUP_SAMPLES)

        print(f"\rSample {curr_sample-NUM_WARMUP_SAMPLES}/{NUM_BENCHMARKING_SAMPLES} - Avg. encoding time: {avg_encoding_time} ms", end=" ")

    if curr_sample-NUM_WARMUP_SAMPLES >= NUM_BENCHMARKING_SAMPLES:
        break

print(f"\n\n** QUANTIZED {BACKBONE_VARIANT} METRICS (CPU) **")
print(f"Avg. encoding time: {avg_encoding_time} ms")
print(f"Avg. throughput: {1/(avg_encoding_time/1000)} fps")

del rgb, depth, rgbd, output

Sample 102/100 - Avg. encoding time: 166.688460929721 ms  

** QUANTIZED resnet50 METRICS (CPU) **
Avg. encoding time: 166.688460929721 ms
Avg. throughput: 5.999215509114448 fps
