## Basic Setup

In [1]:
!pip install torch_tensorrt
!pip install nvidia-modelopt[all]
!pip install onnx
!pip install onnxruntime-gpu


Collecting onnxruntime-gpu~=1.20.1 (from nvidia-modelopt[all])
  Using cached onnxruntime_gpu-1.20.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Using cached onnxruntime_gpu-1.20.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (291.5 MB)
[0mInstalling collected packages: onnxruntime-gpu
[31mERROR: Operation cancelled by user[0m[31m
[0m^C
[31mERROR: Operation cancelled by user[0m[31m
[0m^C
^C


In [4]:
import time
import os
import numpy as np
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.quantization import quantize_dynamic
from torch.quantization import default_observer
from torch.ao.quantization import get_default_qconfig, QConfigMapping
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
from torch.utils.data import DataLoader, Subset
import torch_tensorrt
import modelopt.torch.quantization as mtq
import onnx

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"{device=}")

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## Get CIFAR-10 train and test sets

In [4]:
transform = transforms.Compose([
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])


train_dataset = datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
train_loader = DataLoader(
    datasets.CIFAR10(root="./data", train=True, download=True, transform=transform),
    batch_size=128, shuffle=True
)

test_dataset = datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
test_loader = DataLoader(
    datasets.CIFAR10(root="./data", train=False, download=True, transform=transform),
    batch_size=128,
    shuffle=False,
    num_workers=2,
    drop_last=True,
)

calibration_dataset = Subset(train_dataset, range(256))
calibration_loader = DataLoader(calibration_dataset, batch_size=128, shuffle=False)

## Adjust ResNet18 network for CIFAR-10 dataset

In [5]:
def get_resnet18_for_cifar10():
    model = models.resnet18(weights=None, num_classes=10)
    model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    model.maxpool = nn.Identity()
    return model.to(device)

full_model = get_resnet18_for_cifar10()

## Define Train and Evaluate functions

In [6]:
def train(model, loader, epochs, lr=0.01, save_path="model.pth", silent=False):
    if os.path.exists(save_path):
        if not silent:
            print(f"Model already trained. Loading from {save_path}")
        model.load_state_dict(torch.load(save_path))
        return

    # no saved model found. training from given model state

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    model.train()

    for epoch in range(epochs):
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            optimizer.step()
        if not silent:
            print(f"Epoch {epoch+1}: loss={loss.item():.4f}")

    torch.save(model.state_dict(), save_path)
    if not silent:
        print(f"Training complete. Model saved to {save_path}")

In [7]:
def evaluate(model, device_str):
    model.eval()
    device = torch.device(device_str)
    model.to(device)
    correct = total = 0
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            preds = model(x).argmax(1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    return correct / total

## Define helper functions to measure latency

In [8]:
class Timer:
    def __init__(self):
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.starter = torch.cuda.Event(enable_timing=True)
            self.ender = torch.cuda.Event(enable_timing=True)

    def start(self):
        if self.use_cuda:
            self.starter.record()
        else:
            self.start_time = time.time()

    def stop(self):
        if self.use_cuda:
            self.ender.record()
            torch.cuda.synchronize()
            return self.starter.elapsed_time(self.ender)  # ms
        else:
            return (time.time() - self.start_time) * 1000  # ms

In [9]:
def estimate_latency(model, example_inputs, repetitions=50):
    timer = Timer()
    timings = np.zeros((repetitions, 1))

    # warm-up
    for _ in range(5):
        _ = model(example_inputs)

    with torch.no_grad():
        for rep in range(repetitions):
            timer.start()
            _ = model(example_inputs)
            elapsed = timer.stop()
            timings[rep] = elapsed

    return np.mean(timings), np.std(timings)

## Train full model

In [10]:
train(full_model, train_loader, epochs=10, save_path="full_model.pth")


Model already trained. Loading from full_model.pth


## Evaluate full model

In [51]:
# evaluate accuracy
accuracy_full = evaluate(full_model, 'cuda')
print(f"Accuracy (full): {accuracy_full*100:.2f}%")

# get model size
size_mb_full = os.path.getsize("full_model.pth") / 1e6
print(f"Size (full): {size_mb_full:.2f} MB")

# estimate latency on GPU
example_input = torch.rand(128, 3, 32, 32).cuda()
full_model.cuda()
latency_mu_full_gpu, latency_std_full_gpu = estimate_latency(full_model, example_input)
print(f"Latency (full, on gpu): {latency_mu_full_gpu:.2f} ± {latency_std_full_gpu:.2f} ms")

# estimate latency on CPU
example_input = torch.rand(128, 3, 32, 32).cpu()
full_model.cpu()
latency_mu_full_cpu, latency_std_full_cpu = estimate_latency(full_model, example_input)
print(f"Latency (full, on cpu): {latency_mu_full_cpu:.2f} ± {latency_std_full_cpu:.2f} ms")


Accuracy (full): 77.96%
Size (full): 44.78 MB
Latency (full, on gpu): 30.44 ± 0.44 ms
Latency (full, on cpu): 1721.48 ± 208.92 ms


## Apply Quantization for CPU inference

In [12]:
# define qconfig and prepare model for quantization
full_model.cpu()
full_model.eval()
qconfig = get_default_qconfig("fbgemm")
# qconfig = get_default_qconfig('qnnpack')
qconfig_mapping = QConfigMapping().set_global(qconfig)

example_inputs = torch.rand(1, 3, 32, 32).cuda() # Example input for calibration
# prepared_model = prepare_fx(full_model, {"": qconfig}, example_inputs=example_inputs)
prepared_model = prepare_fx(full_model, qconfig_mapping, example_inputs=example_inputs)


# calibrate model with real data
with torch.no_grad():
    for images, _ in calibration_loader:
        prepared_model(images)
        # calibration doesn't need targets, only forward pass

# convert to quantized model
ptq_model = convert_fx(prepared_model)

# save model
torch.save(ptq_model.state_dict(), "ptq_model.pth")




## Evaluate post-training-quantization (PTQ) model

In [13]:
# evaluate accuracy
ptq_model.cpu()
accuracy_ptq = evaluate(ptq_model, 'cpu')
print(f"Accuracy (PTQ): {accuracy_ptq*100:.2f}%")

# get model size
size_mb_ptq = os.path.getsize("ptq_model.pth") / 1e6
print(f"Size (PTQ): {size_mb_ptq:.2f} MB")

# estimate latency
example_input = torch.rand(128, 3, 32, 32)
latency_mu_ptq, latency_std_ptq = estimate_latency(ptq_model, example_input)
print(f"Latency (PTQ, on cpu): {latency_mu_ptq:.2f} ± {latency_std_ptq:.2f} ms")

Accuracy (PTQ): 77.82%
Size (PTQ): 11.30 MB


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/fx/graph_module.py", line 387, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<eval_with_key>.6", line 7, in forward
    quantize_per_tensor = torch.quantize_per_tensor(x, conv1_input_scale_0, conv1_input_zero_point_0, torch.quint8);  x = conv1_input_scale_0 = conv1_input_zero_point_0 = None
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeEr

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument scale in method wrapper_CUDA_tensor_qparams_quantize_per_tensor)

## Quantize using GPU

In [61]:
# reload full model
full_model = get_resnet18_for_cifar10()
train(full_model, train_loader, epochs=10, save_path="full_model.pth")


Model already trained. Loading from full_model.pth


In [62]:
# define calibration loop
def calibration_loop(model):
    for batch, _ in calibration_loader:
        model(batch.cuda())

In [63]:
# Select quantization config
config = mtq.INT8_SMOOTHQUANT_CFG

# Quantize the model and perform calibration (PTQ)
ptq_model_gpu = mtq.quantize(full_model, config, calibration_loop)

# save model
torch.save(ptq_model_gpu.state_dict(), "ptq_model_gpu.pth")

No module ending with Attention found
Inserted 105 quantizers
Smoothed 1 modules


In [64]:
# save model
torch.save(ptq_model_gpu.state_dict(), "ptq_model_gpu.pth")

In [65]:
# evaluate accuracy
accuracy_ptq_gpu = evaluate(ptq_model_gpu, 'cuda')
print(f"Accuracy (PTQ): {accuracy_ptq_gpu*100:.2f}%")

# get model size
size_mb_ptq_gpu = os.path.getsize("ptq_model_gpu.pth") / 1e6
print(f"Size (PTQ): {size_mb_ptq_gpu:.2f} MB")

# estimate latency
example_input = torch.rand(128, 3, 32, 32).to(device)
latency_mu_ptq_gpu, latency_std_ptq_gpu = estimate_latency(ptq_model_gpu, example_input)
print(f"Latency (PTQ, on gpu): {latency_mu_ptq_gpu:.2f} ± {latency_std_ptq_gpu:.2f} ms")

Accuracy (PTQ): 77.90%
Size (PTQ): 44.82 MB
Latency (PTQ, on gpu): 46.85 ± 3.46 ms


In [66]:
# export to onnx
example_input = torch.rand(128, 3, 32, 32).cuda()
torch.onnx.export(ptq_model_gpu, example_input, "ptq_model_gpu.onnx")

  if not is_torch_export_mode() and len(inputs) == 0:

  assert torch.all(amax >= 0) and not torch.any(torch.isinf(amax)), (



In [67]:
# load onnx model
onnx_model = onnx.load("ptq_model_gpu.onnx")

# check that the model is well formed
onnx.checker.check_model(onnx_model)

In [72]:
import onnxruntime as ort

session = ort.InferenceSession(
    "ptq_model_gpu.onnx",
    providers=["CUDAExecutionProvider"]  # 👈 use GPU
)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

AttributeError: _ARRAY_API not found

SystemError: <built-in function __import__> returned a result with an exception set

In [70]:
# # evaluate accuracy
# accuracy_ptq_gpu_onnx = evaluate(onnx_model, 'cuda')
# print(f"Accuracy (PTQ): {accuracy_ptq_gpu_onnx*100:.2f}%")

# get model size
size_mb_ptq_gpu_onnx = os.path.getsize("ptq_model_gpu.onnx") / 1e6
print(f"Size (PTQ): {size_mb_ptq_gpu_onnx:.2f} MB")

# # estimate latency
# example_input = torch.rand(128, 3, 32, 32).to(device)
# latency_mu_ptq_gpu_onnx, latency_std_ptq_gpu_onnx = estimate_latency(onnx_model, example_input)
# print(f"Latency (PTQ, on gpu): {latency_mu_ptq_gpu_onnx:.2f} ± {latency_std_ptq_gpu_onnx:.2f} ms")

Size (PTQ): 44.82 MB


In [50]:
# Compress the model
mtq.compress(ptq_model_gpu)

AssertionError: Real quantization not supported for this format.

In [40]:
from modelopt.torch.quantization.utils import export_torch_mode
import torch_tensorrt as torchtrt

example_input = torch.rand(128, 3, 32, 32).to(device)

with torch.no_grad():
    with export_torch_mode():
        # Compile the model with Torch-TensorRT Dynamo backend

        exp_program = torch.export.export(ptq_model, (example_input,), strict=False)
        enabled_precisions = {torch.int8}
        # enabled_precisions = {torch.float8_e4m3fn}
        trt_model = torchtrt.dynamo.compile(
            exp_program,
            inputs=[example_input],
            enabled_precisions=enabled_precisions,
            min_block_size=1,
            debug=False,
        )
        # You can also use torch compile path to compile the model with Torch-TensorRT:
        # trt_model = torch.compile(model, backend="tensorrt")



AssertionError: Cannot convert 0.007874015718698502 to TRT constant

While executing %quantize_op : [num_users=1] = call_function[target=torch.ops.tensorrt.quantize_op.default](args = (%x, %conv1_input_quantizer__amax, 8, 0, False, False), kwargs = {_itensor_to_tensor_meta: {<tensorrt_bindings.tensorrt.ITensor object at 0x7b6399579fb0>: ((128, 3, 32, 32), torch.float32, False, (3072, 1024, 32, 1), torch.contiguous_format, False, {})}})
Original traceback:
File "/usr/local/lib/python3.11/dist-packages/torchvision/models/resnet.py", line 285, in forward
    return self._forward_impl(x)
  File "/usr/local/lib/python3.11/dist-packages/modelopt/torch/quantization/nn/modules/quant_module.py", line 89, in forward
    return super().forward(input, *args, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/modelopt/torch/quantization/nn/modules/quant_module.py", line 45, in forward
    input = self.input_quantizer(input)
  File "/usr/local/lib/python3.11/dist-packages/modelopt/torch/quantization/nn/modules/tensor_quantizer.py", line 967, in forward
    outputs = self._quant_forward(inputs)
  File "/usr/local/lib/python3.11/dist-packages/modelopt/torch/quantization/tensor_quant.py", line 327, in forward
    outputs = quantize_op(

In [None]:
transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
])
train_set = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_set = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = DataLoader(test_set, batch_size=64, shuffle=False)

100%|██████████| 170M/170M [00:02<00:00, 80.3MB/s]


## Load and adapt MobileNetV2

In [None]:
model_fp32 = models.mobilenet_v2(pretrained=True)
model_fp32.classifier[1] = nn.Linear(model_fp32.last_channel, 10)
model_fp32.eval()


Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 66.5MB/s]


MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=

## Apply dynamic quantization to Linear layers


In [None]:
model_int8 = quantize_dynamic(model_fp32, {nn.Linear}, dtype=torch.qint8)


## Evaluate

In [None]:
# Evaluate function
@torch.no_grad()
def evaluate(model, dataloader):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    correct = 0
    total = 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = outputs.max(1)
        correct += predicted.eq(labels).sum().item()
        total += labels.size(0)
    return correct / total

# Run evaluation
acc_original = evaluate(model_fp32, test_loader)
print(f"Original Model Accuracy: {acc_original*100:.2f}%")

acc = evaluate(model_int8, test_loader)
print(f"PTQ Quantized Model Accuracy: {acc*100:.2f}%")

PTQ Quantized Model Accuracy: 0.1033


In [None]:
acc_original = evaluate(model_fp32, test_loader)
print(f"Original Model Accuracy: {acc_original:.4f}")


Original Model Accuracy: 0.1030


## Measure inference time and model size

In [None]:
def measure_latency(model, device, input_shape=(1, 3, 224, 224), runs=100):
    model.eval()
    dummy_input = torch.randn(input_shape).to(device)
    with torch.no_grad():
        start = time.time()
        for _ in range(runs):
            model(dummy_input)
        end = time.time()
    return (end - start) / runs

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


latency_org = measure_latency(model_fp32, device)
print(f"Average inference latency (org): {latency_org * 1000:.2f} ms")

latency = measure_latency(model_int8, device)
print(f"Average inference latency (PTQ): {latency * 1000:.2f} ms")




Average inference latency (org): 33.34 ms
Average inference latency (PTQ): 33.33 ms


## Save and report model size


In [None]:
# Save and report model size
torch.save(model_int8.state_dict(), "ptq_model.pth")
size_mb = os.path.getsize("ptq_model.pth") / 1e6
print(f"Model size (PTQ): {size_mb:.2f} MB")


Model size (PTQ): 9.15 MB


In [None]:
torch.save(model_fp32.state_dict(), "org_model.pth")
size_mb = os.path.getsize("org_model.pth") / 1e6
print(f"Model size (org): {size_mb:.2f} MB")

Model size (org): 9.19 MB
