In [1]:
!pip install "nvidia-modelopt[all]" -U --extra-index-url https://pypi.nvidia.com

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting nvidia-modelopt[all]
  Downloading https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.27.0-py3-none-manylinux_2_28_x86_64.whl (659 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 659.5/659.5 kB 4.3 MB/s eta 0:00:00
Collecting nvidia-modelopt-core==0.27.0 (from nvidia-modelopt[all])
  Downloading https://pypi.nvidia.com/nvidia-modelopt-core/nvidia_modelopt_core-0.27.0-cp310-cp310-manylinux_2_28_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 24.1 MB/s eta 0:00:00
Collecting ninja (from nvidia-modelopt[all])
  Downloading ninja-1.11.1.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.0 kB)
Collecting pydantic>=2.0 (from nvidia-modelopt[all])
  Downloading pydantic-2.11.3-py3-none-any.whl.metadata (65 kB)
     ━━━━━━━━━

## Setup

In [1]:
!pip install torch-pruning



In [2]:
import os
import time
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import torch_pruning as tp

# Seed PyTorch's global RNG so weight initialisation, DataLoader shuffling and
# the random example input drawn later are repeatable from a fresh kernel.
SEED = 0
torch.manual_seed(SEED)

# Prefer the GPU when one is visible; models and batches are moved to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"{device=}")

device=device(type='cuda')


## Get CIFAR-10 train and test sets

In [3]:
# Preprocessing shared by both splits: force a 32-pixel size, convert to a
# tensor, then shift/scale with mean 0.5 and std 0.5 (the single value given
# for each is applied across the channels).
preprocess_steps = [
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]
transform = transforms.Compose(preprocess_steps)

# Training split: downloaded on first use, shuffled, batches of 128.
train_set = datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

# Test split: same preprocessing, fixed order, batches of 256.
test_set = datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
test_loader = DataLoader(test_set, batch_size=256)

## Adjust ResNet18 network for CIFAR-10 dataset

In [4]:
def get_resnet18_for_cifar10():
    """Build an untrained 10-class ResNet-18 whose stem does not downsample.

    The first convolution is replaced by a 3x3, stride-1, padding-1 layer and
    the max-pool by an identity, so the spatial size of the input is preserved
    through the stem. The model is returned on the notebook-level ``device``.
    """
    net = models.resnet18(weights=None, num_classes=10)
    net.conv1 = nn.Conv2d(
        in_channels=3,
        out_channels=64,
        kernel_size=3,
        stride=1,
        padding=1,
        bias=False,
    )
    net.maxpool = nn.Identity()
    net = net.to(device)
    return net


full_model = get_resnet18_for_cifar10()

## Define Train and Evaluate functions

In [5]:
def train(model, loader, epochs, lr=0.01, save_path="model.pth", silent=False):
    """Train ``model`` with SGD, or restore it from ``save_path`` if that file exists.

    ``save_path`` doubles as a cache: when the file is already present the
    weights are loaded into ``model`` and no training happens. Otherwise the
    model is trained in place and its ``state_dict`` is written to ``save_path``.

    Args:
        model: network to train; modified in place.
        loader: iterable yielding ``(inputs, targets)`` batches.
        epochs: number of passes over ``loader``.
        lr: SGD learning rate (momentum is fixed at 0.9).
        save_path: checkpoint file to load from / save to.
        silent: when True, suppress all progress messages.

    Returns:
        None.
    """
    # Use the device the model already lives on, so batches and restored
    # checkpoints always end up next to the weights.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    if os.path.exists(save_path):
        if not silent:
            print(f"Model already trained. Loading from {save_path}")
        # map_location lets a checkpoint written on a GPU host be restored on
        # a CPU-only host (and vice versa) instead of failing to deserialize.
        model.load_state_dict(torch.load(save_path, map_location=model_device))
        return

    # no saved model found. training from given model state

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    model.train()

    for epoch in range(epochs):
        last_loss = None  # stays None when the loader yields no batches
        for x, y in loader:
            x, y = x.to(model_device), y.to(model_device)
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            optimizer.step()
            last_loss = loss
        if not silent:
            if last_loss is None:
                print(f"Epoch {epoch+1}: no batches in loader")
            else:
                # Loss of the final batch of the epoch, not an epoch average.
                print(f"Epoch {epoch+1}: loss={last_loss.item():.4f}")

    torch.save(model.state_dict(), save_path)
    if not silent:
        print(f"Training complete. Model saved to {save_path}")

In [6]:
def evaluate(model, loader=None):
    """Return the top-1 accuracy of ``model`` as a fraction in ``[0, 1]``.

    Args:
        model: network to evaluate. It is switched to eval mode and left there.
        loader: iterable yielding ``(inputs, labels)`` batches. Defaults to the
            notebook-level ``test_loader`` when omitted.

    Returns:
        Number of correct argmax predictions divided by the number of samples.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    if loader is None:
        loader = test_loader

    # Move batches to wherever the model's weights are.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(model_device), y.to(model_device)
            preds = model(x).argmax(1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    if total == 0:
        raise ValueError("evaluate() received a loader with no samples")
    return correct / total

## Define helper functions to measure latency

In [7]:
class Timer:
    """Stopwatch reporting elapsed milliseconds.

    Uses a pair of CUDA events when CUDA is available, otherwise the
    monotonic ``time.perf_counter`` clock.
    """

    def __init__(self):
        self.use_cuda = torch.cuda.is_available()
        # CPU-path start timestamp; None until start() has been called.
        self.start_time = None
        if self.use_cuda:
            self.starter = torch.cuda.Event(enable_timing=True)
            self.ender = torch.cuda.Event(enable_timing=True)

    def start(self):
        """Mark the beginning of the timed region."""
        if self.use_cuda:
            self.starter.record()
        else:
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump (NTP/clock adjustments) and yield negative or skewed spans.
            self.start_time = time.perf_counter()

    def stop(self):
        """Mark the end of the timed region and return the elapsed time in ms.

        Raises:
            RuntimeError: on the CPU path, if called before ``start()``.
        """
        if self.use_cuda:
            self.ender.record()
            # Wait for the recorded events to complete before reading them.
            torch.cuda.synchronize()
            return self.starter.elapsed_time(self.ender)  # ms
        if self.start_time is None:
            raise RuntimeError("Timer.stop() called before Timer.start()")
        return (time.perf_counter() - self.start_time) * 1000  # ms

In [8]:
def estimate_latency(model, example_inputs, repetitions=50):
    """Measure the forward-pass latency of ``model`` on ``example_inputs``.

    Five untimed warm-up passes are run first, then ``repetitions`` timed
    passes. Everything runs under ``torch.no_grad()``.

    Args:
        model: callable network to time.
        example_inputs: input passed unchanged to ``model`` on every call.
        repetitions: number of timed forward passes.

    Returns:
        ``(mean, std)`` of the per-pass timings in milliseconds.
    """
    timer = Timer()
    timings = np.zeros((repetitions, 1))

    # A model left in training mode would update its BatchNorm running
    # statistics from the example input; measure in eval mode and put the
    # original mode back afterwards.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    try:
        with torch.no_grad():
            # warm-up (inside no_grad so no autograd graph is built)
            for _ in range(5):
                model(example_inputs)

            for rep in range(repetitions):
                timer.start()
                model(example_inputs)
                timings[rep] = timer.stop()
    finally:
        if was_training:
            model.train()

    return np.mean(timings), np.std(timings)

## Train and Evaluate full model

In [None]:
# Train the baseline for 10 epochs, or reload full_model.pth if it already exists.
train(full_model, train_loader, epochs=10, save_path="full_model.pth")
# evaluate() also leaves the model in eval mode for the measurements below.
accuracy_full = evaluate(full_model)

# Random batch used for MAC/parameter counting and latency timing; the same
# tensor is passed to the pruner and reused inside the pruning loop.
# NOTE(review): the batch size is 128, so the reported MACs and latency
# presumably describe a whole batch rather than a single image — confirm.
example_input = torch.rand(128, 3, 32, 32).to(device)
macs, parameters = tp.utils.count_ops_and_params(full_model, example_input)
latency_mu, latency_std = estimate_latency(full_model, example_input)
print(f"[full model] \t\tMACs: {macs/1e9:.2f} G, \tParameters: {parameters/1e6:.2f} M, \tLatency: {latency_mu:.2f} ± {latency_std:.2f} ms \tAccuracy: {accuracy_full*100:.2f}%")

## Prune by L2 magnitude

In [None]:
# Prune an independent copy so full_model itself is left untouched.
pruned_model = copy.deepcopy(full_model).to(device)

# Layers the pruner must leave alone: any Linear layer with 10 outputs,
# i.e. the final classifier, whose output width has to stay at 10 classes.
ignored_layers = [
    layer
    for layer in pruned_model.modules()
    if isinstance(layer, torch.nn.Linear) and layer.out_features == 10
]

In [None]:
# Iterative pruning setup: the pruner is stepped `iterative_steps` times by
# the loop that follows, using L2-norm (p=2) magnitude as the importance score.
# NOTE(review): pruning_ratio=1 presumably means the schedule ends with
# everything prunable removed, and round_to=2 presumably keeps channel counts
# at multiples of 2 — confirm against the torch-pruning documentation.
iterative_steps = 20
l2_importance = tp.importance.MagnitudeImportance(p=2)
pruner = tp.pruner.MagnitudePruner(
    model=pruned_model,
    example_inputs=example_input,
    importance=l2_importance,
    pruning_ratio=1,
    iterative_steps=iterative_steps,
    ignored_layers=ignored_layers,
    round_to=2,
)

In [None]:
# One prune -> fine-tune -> measure cycle per scheduled step.
# The loop variable is named `step` rather than `iter` so the builtin iter()
# is not shadowed in the notebook namespace after this cell runs.
for step in range(iterative_steps):
    # prune
    pruner.step()
    # evaluate after prune
    acc_before = evaluate(pruned_model)
    # fine-tune pruned model (one epoch; cached per step under its own file name)
    train(pruned_model, train_loader, epochs=1, save_path=f"pruned_model_{step}.pth", silent=True)
    # evaluate after fine-tune
    acc_after = evaluate(pruned_model)
    # count MACs and parameters
    macs, parameters = tp.utils.count_ops_and_params(pruned_model, example_input)
    latency_mu, latency_std = estimate_latency(pruned_model, example_input)
    # nominal ratio reached after this step, assuming the pruner's target ratio of 1
    current_pruning_ratio = 1 / iterative_steps * (step + 1)
    print(f"[pruned model] \tPruning ratio: {current_pruning_ratio:.2f}, \tMACs: {macs/1e9:.2f} G, \tParameters: {parameters/1e6:.2f} M, \tLatency: {latency_mu:.2f} ± {latency_std:.2f} ms \tAccuracy pruned: {acc_before*100:.2f}%\tFinetuned: {acc_after*100:.2f}%")


## Extra fine-tuning of the final pruned model

In [None]:
# Five more epochs on the final pruned network, cached under its own checkpoint
# name (plain string: the original f-string had no placeholders).
train(pruned_model, train_loader, epochs=5, save_path="pruned_model_final_tuning.pth")

In [None]:
# Test-set accuracy (fraction of correct predictions) after the extra fine-tuning.
accuracy_final = evaluate(pruned_model)
print(f"Pruned extra fine-tuned model accuracy: {accuracy_final*100:.2f}%")