# PTQ

In [6]:
import torch
import torch.nn as nn
import torch.quantization as quantization
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

class PTQModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = quantization.QuantStub()
        self.conv1 = nn.Conv2d(1, 16, 3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(16, 16, 3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(16)
        self.relu2 = nn.ReLU()
        self.fc = nn.Linear(16 * 28 * 28, 10)
        self.dequant = quantization.DeQuantStub()

    def forward(self, x):
        x = self.quant(x)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = x.reshape(x.size(0), -1)
        x = self.fc(x)
        x = self.dequant(x)
        return x

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform, download=True)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


def train_fp32_model(model, dataloader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for images, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(dataloader):.4f}")


def evaluate_model(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f"Accuracy: {accuracy:.2f}%")
    return accuracy


def apply_ptq_with_calibration(model, calibration_loader):
    model.eval()
    model.qconfig = quantization.get_default_qconfig("x86")
    model_fused = quantization.fuse_modules(model, [["conv1", "bn1", "relu1"], ["conv2", "bn2", "relu2"]])
    model_prepared = quantization.prepare(model_fused)
    with torch.no_grad():
        for images, _ in calibration_loader:
            model_prepared(images)
    model_int8 = quantization.convert(model_prepared)
    return model_int8


model_fp32 = PTQModel()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_fp32.parameters(), lr=0.001)

print("Training FP32 model...")
train_fp32_model(model_fp32, train_loader, criterion, optimizer, epochs=5)

print("Evaluating FP32 model...")
evaluate_model(model_fp32, test_loader)

print("Applying PTQ with calibration...")
calibration_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
model_int8 = apply_ptq_with_calibration(model_fp32, calibration_loader)

print("Evaluating INT8 model...")
evaluate_model(model_int8, test_loader)

Training FP32 model...
Epoch [1/5], Loss: 0.1524
Epoch [2/5], Loss: 0.0544
Epoch [3/5], Loss: 0.0380
Epoch [4/5], Loss: 0.0279
Epoch [5/5], Loss: 0.0196
Evaluating FP32 model...
Accuracy: 98.39%
Applying PTQ with calibration...
Evaluating INT8 model...
Accuracy: 98.35%


98.35

#QAT

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.ao.quantization as quantization

class QATModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = quantization.QuantStub()
        self.conv1 = nn.Conv2d(1, 16, 3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(16, 16, 3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(16)
        self.relu2 = nn.ReLU()
        self.fc = nn.Linear(16 * 28 * 28, 10)
        self.dequant = quantization.DeQuantStub()

    def forward(self, x):
        x = self.quant(x)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = x.reshape(x.size(0), -1)
        x = self.fc(x)
        x = self.dequant(x)
        return x

def get_data_loaders(batch_size=64):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
    test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    return train_loader, test_loader

def training_loop(model, train_loader, criterion, optimizer, num_epochs=1):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f"Accuracy: {accuracy:.2f}%")

train_loader, test_loader = get_data_loaders()

model_fp32 = QATModel()
model_fp32.qconfig = quantization.get_default_qat_qconfig('x86')

model_fp32.eval()
model_fp32_fused = quantization.fuse_modules(model_fp32,[["conv1", "bn1", "relu1"], ["conv2", "bn2", "relu2"]])

model_fp32_prepared = quantization.prepare_qat(model_fp32_fused.train())

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_fp32_prepared.parameters(), lr=0.001)

print("Training QAT model...")
training_loop(model_fp32_prepared, train_loader, criterion, optimizer, num_epochs=5)

print("Converting to INT8 model...")
model_fp32_prepared.eval()
model_int8 = quantization.convert(model_fp32_prepared)

print("Evaluating INT8 model...")
evaluate_model(model_int8, test_loader)


Training QAT model...
Epoch [1/5], Loss: 0.2276
Epoch [2/5], Loss: 0.0575
Epoch [3/5], Loss: 0.0399
Epoch [4/5], Loss: 0.0300
Epoch [5/5], Loss: 0.0211
Converting to INT8 model...
Evaluating INT8 model...
Accuracy: 98.32%


양자화 성공적인지 확인

In [None]:
print(model_int8.conv1.weight)

<bound method Conv2d.weight of QuantizedConvReLU2d(1, 16, kernel_size=(3, 3), stride=(1, 1), scale=0.016592519357800484, zero_point=0, padding=(1, 1))>


처음에 시도했다가 오류난 코드

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.quantization as quant
from torchvision import models, transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
import os
import time

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = CIFAR10(root="./data", train=True, transform=transform, download=True)
test_dataset = CIFAR10(root="./data", train=False, transform=transform, download=True)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(512, 10)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
model.qconfig = quant.get_default_qat_qconfig("qnnpack")
quant.prepare_qat(model, inplace=True)

for epoch in range(5):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"QAT Epoch {epoch+1}/5, Loss: {running_loss/len(train_loader):.4f}")


model.eval()
quantized_model = quant.convert(model, inplace=False)

def evaluate(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total

def measure_inference_time(model, dataloader):
    model.eval()
    start_time = time.time()
    with torch.no_grad():
        for images, _ in dataloader:
            images = images.to(device)
            _ = model(images)
    elapsed_time = time.time() - start_time
    return elapsed_time

def print_model_size(model, label=""):
    torch.save(model.state_dict(), "temp.p")
    size = os.path.getsize("temp.p")
    print(f"Model: {label}\tSize (KB): {size / 1e3:.2f}")
    os.remove("temp.p")
    return size

qat_accuracy = evaluate(quantized_model, test_loader)

qat_time = measure_inference_time(quantized_model, test_loader)

qat_size = print_model_size(quantized_model, "INT8")

print(f"Quantized Model Accuracy: {qat_accuracy:.2f}%")
print(f"Quantized Model Size (KB): {qat_size / 1e3:.2f}")


Files already downloaded and verified
Files already downloaded and verified
QAT Epoch 1/5, Loss: 0.9162
QAT Epoch 2/5, Loss: 0.5894
QAT Epoch 3/5, Loss: 0.4555
QAT Epoch 4/5, Loss: 0.3663
QAT Epoch 5/5, Loss: 0.2898


NotImplementedError: Could not run 'quantized::conv2d.new' with arguments from the 'CUDA' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'quantized::conv2d.new' is only available for these backends: [Meta, QuantizedCPU, QuantizedCUDA, BackendSelect, Python, FuncTorchDynamicLayerBackMode, Functionalize, Named, Conjugate, Negative, ZeroTensor, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradMPS, AutogradXPU, AutogradHPU, AutogradLazy, AutogradMeta, Tracer, AutocastCPU, AutocastXPU, AutocastMPS, AutocastCUDA, FuncTorchBatched, BatchedNestedTensor, FuncTorchVmapMode, Batched, VmapMode, FuncTorchGradWrapper, PythonTLSSnapshot, FuncTorchDynamicLayerFrontMode, PreDispatch, PythonDispatcher].

Meta: registered at ../aten/src/ATen/core/MetaFallbackKernel.cpp:23 [backend fallback]
QuantizedCPU: registered at ../aten/src/ATen/native/quantized/cpu/qconv.cpp:1972 [kernel]
QuantizedCUDA: registered at ../aten/src/ATen/native/quantized/cudnn/Conv.cpp:391 [kernel]
BackendSelect: fallthrough registered at ../aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback]
Python: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:153 [backend fallback]
FuncTorchDynamicLayerBackMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:497 [backend fallback]
Functionalize: registered at ../aten/src/ATen/FunctionalizeFallbackKernel.cpp:349 [backend fallback]
Named: registered at ../aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback]
Conjugate: registered at ../aten/src/ATen/ConjugateFallback.cpp:17 [backend fallback]
Negative: registered at ../aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback]
ZeroTensor: registered at ../aten/src/ATen/ZeroTensorFallback.cpp:86 [backend fallback]
ADInplaceOrView: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:96 [backend fallback]
AutogradOther: registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:63 [backend fallback]
AutogradCPU: registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:67 [backend fallback]
AutogradCUDA: registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:75 [backend fallback]
AutogradXLA: registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:79 [backend fallback]
AutogradMPS: registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:87 [backend fallback]
AutogradXPU: registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:71 [backend fallback]
AutogradHPU: registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:100 [backend fallback]
AutogradLazy: registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:83 [backend fallback]
AutogradMeta: registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:91 [backend fallback]
Tracer: registered at ../torch/csrc/autograd/TraceTypeManual.cpp:294 [backend fallback]
AutocastCPU: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:321 [backend fallback]
AutocastXPU: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:463 [backend fallback]
AutocastMPS: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:209 [backend fallback]
AutocastCUDA: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:165 [backend fallback]
FuncTorchBatched: registered at ../aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:731 [backend fallback]
BatchedNestedTensor: registered at ../aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp:758 [backend fallback]
FuncTorchVmapMode: fallthrough registered at ../aten/src/ATen/functorch/VmapModeRegistrations.cpp:27 [backend fallback]
Batched: registered at ../aten/src/ATen/LegacyBatchingRegistrations.cpp:1075 [backend fallback]
VmapMode: fallthrough registered at ../aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]
FuncTorchGradWrapper: registered at ../aten/src/ATen/functorch/TensorWrapper.cpp:207 [backend fallback]
PythonTLSSnapshot: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:161 [backend fallback]
FuncTorchDynamicLayerFrontMode: registered at ../aten/src/ATen/functorch/DynamicLayer.cpp:493 [backend fallback]
PreDispatch: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:165 [backend fallback]
PythonDispatcher: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:157 [backend fallback]
