# Image encoding benchmark

Visualize pretrainer results on a given image and measure the encoding time.

First, build the model:

In [7]:
import sys
sys.path.append("../")
from multitudinous.utils.model_builder import build_img_backbone
import torch

BACKBONE_VARIANT = "resnet50"
BATCH_SIZE = 4

# build the model
model = build_img_backbone(BACKBONE_VARIANT, 4, "../resnet50_unet/img_backbone_8.pth")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

model.eval()

total_params = sum(p.numel() for p in model.parameters())

# print the model
print(model)

print(f"Total number of parameters: {total_params}")

ResNet50(
  (conv1): Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BottleneckBlock(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (dropout1): Dropout2d(p=0.5, inplace=False)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (dropout2): Dropout2d(p=0.5, inplace=False)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (dropout3): Dropout2d(p=0.5, inplace=Fals

Now, create the dataset and dataloader instances:

In [8]:
from multitudinous.utils.dataset_builder import build_img_dataset

# build the dataset
_, _, test_set = build_img_dataset("tum_rgbd", "../../data/tum_rgbd1", "train", "val", "test")
# create the dataloader
test_loader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

Now, run inference on a given image:

In [9]:
import torch

# create the timers
starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)

NUM_BENCHMARKING_SAMPLES=100
NUM_WARMUP_SAMPLES=10

# metrics
total_imgs = len(test_set)
total_elapsed_time = 0
curr_sample = 0
avg_encoding_time = 0

for rgb, depth in test_loader:

    # move the images to the device
    rgb = rgb.to(device)
    depth = depth.to(device)
    depth = depth.unsqueeze(1)
    rgbd = torch.cat((rgb, depth), dim=1)

    # run inference
    with torch.no_grad():
        starter.record() # start recording
        output = model(rgbd)
        ender.record()
        torch.cuda.synchronize()
        encoding_time = starter.elapsed_time(ender)

    curr_sample += BATCH_SIZE

    if curr_sample > NUM_WARMUP_SAMPLES:
        total_elapsed_time += encoding_time

        avg_encoding_time = total_elapsed_time/(curr_sample-NUM_WARMUP_SAMPLES)

        print(f"\rSample {curr_sample-NUM_WARMUP_SAMPLES}/{NUM_BENCHMARKING_SAMPLES} - Avg. encoding time: {avg_encoding_time} ms", end=" ")

    if curr_sample-NUM_WARMUP_SAMPLES > NUM_BENCHMARKING_SAMPLES:
        break

print(f"\n\n** {BACKBONE_VARIANT} METRICS (GPU) **")
print(f"Avg. encoding time: {avg_encoding_time} ms")
print(f"Avg. throughput: {1/(avg_encoding_time/1000)} fps")

del rgb, depth, rgbd, output

Sample 90/100 - Avg. encoding time: 69.32656572129991 ms 

KeyboardInterrupt: 

Now, perform the same benchmarking in CPU:

In [None]:
import time

# metrics
total_elapsed_time = 0
curr_sample = 0
avg_encoding_time = 0

model = model.to("cpu")
model.eval()

for rgb, depth in dataloader:

    # move the images to the device
    depth = depth.unsqueeze(1)
    rgbd = torch.cat((rgb, depth), dim=1)

    # run inference
    with torch.no_grad():
        start = time.time()
        output = model(rgbd)
        end = time.time()
        torch.cuda.synchronize()
        encoding_time = (end - start)*1000 # in ms

    curr_sample += BATCH_SIZE

    if curr_sample > NUM_WARMUP_SAMPLES:

        total_elapsed_time += encoding_time

        avg_encoding_time = total_elapsed_time/(curr_sample-NUM_WARMUP_SAMPLES)

        print(f"\rSample {curr_sample-NUM_WARMUP_SAMPLES}/{NUM_BENCHMARKING_SAMPLES} - Avg. encoding time: {avg_encoding_time} ms", end=" ")

    if curr_sample-NUM_WARMUP_SAMPLES > NUM_BENCHMARKING_SAMPLES:
        break

print(f"\n\n** {BACKBONE_VARIANT} METRICS (CPU) **")
print(f"Avg. encoding time: {avg_encoding_time} ms")
print(f"Avg. throughput: {1/(avg_encoding_time/1000)} fps")

del rgb, depth, rgbd, output

Sample 102/100 - Avg. encoding time: 705.8496568717209 ms 

** resnet50 METRICS (CPU) **
Avg. encoding time: 705.8496568717209 ms
Avg. throughput: 1.4167322889012004 fps
