In [10]:
import time
import numpy as np
import os
import torch
import torchvision.models as models
import torch_tensorrt
from torch_tensorrt._enums import memory_format
from torch.profiler import profile, record_function, ProfilerActivity
import warnings

warnings.filterwarnings(
    "ignore",
    message="To copy construct from a tensor, it is recommended to use sourceTensor.detach\\(\\).clone\\(\\)",
    category=UserWarning
)

In [11]:
# Prepare models, one for fp32 and another for fp16
base_model = models.resnet18(pretrained=True).eval().cuda()
base_model = base_model.to(memory_format=torch.channels_last)
model_fp16 = models.resnet18(pretrained=True).eval().cuda()
model_fp16 = model_fp16.to(memory_format=torch.channels_last)
model_fp16 = model_fp16.half() # for fp16





In [17]:
def benchmark_model(model, input_tensor,
                    warmup_iters: int = 15,
                    timed_iters: int = 100):
    # Warm‑up
    for _ in range(warmup_iters):
        _ = model(input_tensor)
    torch.cuda.synchronize()

    times = []
    for _ in range(timed_iters):
        t0 = time.time()
        _ = model(input_tensor)
        torch.cuda.synchronize()
        t1 = time.time()
        times.append((t1 - t0) * 1000)  # ms

    times = np.array(times)
    print(f"    → mean: {times.mean():.2f} ms  ±  std: {times.std():.2f} ms")
    return times

In [19]:
for batch_size in [1, 8, 16, 32]:
    print(f"\n=== Batch size {batch_size} ===")
    input_shape = (batch_size, 3, 224, 224)
    input_tensor = torch.randn(input_shape, device="cuda") \
                         .to(memory_format=torch.channels_last)

    # prepare output dirs
    for tag in ["pytorch", "trt_fp32", "trt_fp16"]:
        run_dir = f"tb_logs/{tag}_bs{batch_size}"
        os.makedirs(os.path.join(run_dir, "plugins", "profile"), exist_ok=True)

    # ----- Benchmark raw latencies -----
    print(" Timing  (ms):")
    print("  PyTorch FP32:", end="")
    benchmark_model(base_model, input_tensor)
    print("  TRT FP32:   ", end="")
    # compile once to get trt_model
    trt_model = torch_tensorrt.compile(
        base_model,
        inputs=[torch_tensorrt.Input(
            min_shape=input_shape,
            opt_shape=input_shape,
            max_shape=input_shape,
            dtype=torch.float32,
            format=torch.channels_last
        )],
        enabled_precisions={torch.float32},
        method="fx",
        workspace_size=1 << 32
    )
    benchmark_model(trt_model, input_tensor)

    print("  TRT FP16:   ", end="")
    trt_model_fp16 = torch_tensorrt.compile(
        model_fp16,
        inputs=[torch_tensorrt.Input(
            min_shape=input_shape,
            opt_shape=input_shape,
            max_shape=input_shape,
            dtype=torch.float16,
            format=torch.channels_last
        )],
        enabled_precisions={torch.float16},
        method="fx",
        workspace_size=1 << 32
    )
    input_tensor_fp16 = input_tensor.half()
    benchmark_model(trt_model_fp16, input_tensor_fp16)

    # ----- Profiling traces -----
    # PyTorch FP32
    run_dir = f"tb_logs/pytorch_bs{batch_size}"
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True, profile_memory=True,
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(run_dir),
        with_stack=True
    ) as prof:
        for _ in range(50):
            with record_function("pytorch_inference"):
                _ = base_model(input_tensor)
            prof.step()
    torch.cuda.synchronize()
    print(f" PyTorch profiling trace saved for batch {batch_size}")

    # TRT FP32
    run_dir = f"tb_logs/trt_fp32_bs{batch_size}"
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True, profile_memory=True,
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(run_dir),
        with_stack=True
    ) as prof:
        for _ in range(50):
            with record_function("trt_fp32_inference"):
                _ = trt_model(input_tensor)
            prof.step()
    torch.cuda.synchronize()
    print(f" TRT FP32 profiling trace saved for batch {batch_size}")

    # TRT FP16
    run_dir = f"tb_logs/trt_fp16_bs{batch_size}"
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True, profile_memory=True,
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(run_dir),
        with_stack=True
    ) as prof:
        for _ in range(50):
            with record_function("trt_fp16_inference"):
                _ = trt_model_fp16(input_tensor_fp16)
            prof.step()
    torch.cuda.synchronize()
    print(f" TRT FP16 profiling trace saved for batch {batch_size}")


=== Batch size 1 ===
 Timing  (ms):
  PyTorch FP32:    → mean: 4.28 ms  ±  std: 0.68 ms
  TRT FP32:       → mean: 3.17 ms  ±  std: 0.71 ms
  TRT FP16:       → mean: 3.05 ms  ±  std: 0.70 ms
 PyTorch profiling trace saved for batch 1
 TRT FP32 profiling trace saved for batch 1
 TRT FP16 profiling trace saved for batch 1

=== Batch size 8 ===
 Timing  (ms):
  PyTorch FP32:    → mean: 9.17 ms  ±  std: 0.11 ms
  TRT FP32:       → mean: 7.84 ms  ±  std: 0.08 ms
  TRT FP16:       → mean: 4.57 ms  ±  std: 1.06 ms
 PyTorch profiling trace saved for batch 8
 TRT FP32 profiling trace saved for batch 8
 TRT FP16 profiling trace saved for batch 8

=== Batch size 16 ===
 Timing  (ms):
  PyTorch FP32:    → mean: 17.13 ms  ±  std: 0.23 ms
  TRT FP32:       → mean: 14.12 ms  ±  std: 0.15 ms
  TRT FP16:       → mean: 6.22 ms  ±  std: 1.13 ms
 PyTorch profiling trace saved for batch 16
 TRT FP32 profiling trace saved for batch 16
 TRT FP16 profiling trace saved for batch 16

=== Batch size 32 ===
 Timi