# Quantization
This notebook acts as an example of how to use the quantization techniques.

## Setup
* Import the necessary packages.
* Load a model.
* Load a dataset.
* Analyze performance of the model prior to quantization.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import importlib
import inspect
import sys
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn.functional as F

# Add thesis package to path
sys.path.append("../")

import src.general as general
import src.metrics as metrics
import src.evaluation as eval
import src.compression.quantization as quant


In [3]:
model_state = "../models/mnist.pt"
model_class = "models.mnist"

# Import the module classes
module = importlib.import_module(model_class)
classes = general.get_module_classes(module)
for cls in classes:
    globals()[cls.__name__] = cls

# Get device
device = general.get_device()

# Load the model
model = torch.load(model_state, map_location=torch.device(device))

Using cuda: False


In [4]:
# Load MNIST dataset
batch_size = 64
test_batch_size = 1000
use_cuda = False

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
mnist_transform = transforms.ToTensor()
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True, transform=mnist_transform,),
    batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True, transform=mnist_transform,),
    batch_size=test_batch_size, shuffle=True, **kwargs)

In [5]:
def print_metrics(loss, score, batch_duration, data_duration, batch_size, model):
    print("Loss: {:.6f}".format(loss))
    print("Score: {:.6f}".format(score))
    print("Time per batch: {:.4f} ms ({} per batch)".format(batch_duration, batch_size))
    print("Time per data point: {:.4f} ms".format(data_duration))
    params = eval.get_model_parameters(model)
    print('Number of parameters: {}'.format(params))
    model_size = eval.get_model_size(model)
    print('Model Size: {} MB'.format(model_size))

### Pre-Quantization Evaluation

In [6]:
criterion = F.nll_loss

loss, score, duration, batch_duration, data_duration = general.test(model, device, test_loader, criterion, metric=metrics.accuracy)

Test: 100%|██████████| 60/60 [00:04<00:00, 13.40it/s]

Average loss = 0.0768
Accuracy = 0.9770
Elapsed time = 4494.62 milliseconds (74.91 per batch, 0.07 per data point)





In [7]:
print_metrics(loss, score, batch_duration, data_duration, test_batch_size, model)

Loss: 0.076818
Score: 0.976983
Time per batch: 74.9103 ms (1000 per batch)
Time per data point: 0.0749 ms
Number of parameters: 431080
Model Size: 1.65 MB


## Quantization

### Dynamic Quantization
Here the model’s weights are pre-quantized; the activations are quantized on-the-fly (“dynamic”) during inference. 

Currently only Linear and Recurrent (LSTM, GRU, RNN) layers are supported for dynamic quantization.

In [8]:
dynamic_quantized_model = quant.dynamic_quantization(model)



In [9]:
loss, score, duration, batch_duration, data_duration = general.test(dynamic_quantized_model, device, test_loader, criterion, metric=metrics.accuracy)
print_metrics(loss, score, batch_duration, data_duration, test_batch_size, dynamic_quantized_model)

Test: 100%|██████████| 60/60 [00:04<00:00, 13.77it/s]

Average loss = 0.0766
Accuracy = 0.9770
Elapsed time = 4357.64 milliseconds (72.63 per batch, 0.07 per data point)
Loss: 0.076646
Score: 0.976983
Time per batch: 72.6273 ms (1000 per batch)
Time per data point: 0.0726 ms
Number of parameters: 25570
Model Size: 0.49 MB





### Static Quantization
Post Training Static Quantization (PTQ) also pre-quantizes model weights but instead of calibrating activations on-the-fly, the clipping range is pre-calibrated and fixed (“static”) using validation data.

In [18]:
static_quantized_model = quant.static_quantization(model, device, train_loader)

QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, quant_min=0, quant_max=127){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})


Test: 100%|██████████| 938/938 [00:06<00:00, 138.82it/s]

Average loss = 0.0769
Elapsed time = 6758.17 milliseconds (7.20 per batch, 0.23 per data point)
Post Training Quantization: Calibration done





In [None]:
loss, score, duration, batch_duration, data_duration = general.test(static_quantized_model, device, test_loader, criterion, metric=metrics.accuracy)
print_metrics(loss, score, batch_duration, data_duration, test_batch_size, static_quantized_model)