In [1]:
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt

from torchvision import datasets
from torchvision import transforms
from tqdm import tqdm
from pathlib import Path


In [3]:
# making torch to be deterministic
_ = torch.manual_seed(0)

# data transformation
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# loading mnist dataset
train_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=10, shuffle=True)
test_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(train_data, batch_size=10, shuffle=True)

# define device 
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [4]:
# Define our Deep Neural Network
class DeepNeuralNetwork(nn.Module):
    def __init__(self, hide_size1=100, hide_size2=100):
        super(DeepNeuralNetwork, self).__init__()
        self.linear1 = nn.Linear(28*28, hide_size1) 
        self.linear2 = nn.Linear(hide_size1, hide_size2) 
        self.linear3 = nn.Linear(hide_size2, 10)
        self.relu = nn.ReLU()

    def forward(self, img):
        x = img.view(-1, 28*28)
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        return x
    
model = DeepNeuralNetwork().to(device)

In [None]:
def train(train_loader, model, epochs=5, max_iter=None):
    entropy = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    total_iter = 0
    for epoch in range(epochs):
        model.train()

        total_loss = 0
        num_iter = 0

        data_iter = tqdm(train_loader, desc=f'Epoch {epoch+1}')
        if max_iter is not None:
            data_iter.total = max_iter

        for data in data_iter:
            num_iter += 1
            total_iter += 1
            X, y = data
            X = X.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            output = model(X.view(-1, 28*28))
            loss = entropy(output, y)
            total_loss += loss.item()
            avg_loss = total_loss / num_iter
            data_iter.set_postfix(loss=avg_loss)

            loss.backward()
            optimizer.step()

            if max_iter is not None and total_iter >= max_iter:
                return
            


def ModelSize(model):
    torch.save(model.state_dict(), "temp_model.p")
    print(f"Model Size(KB): {os.path.getsize('temp_model.p')/1e3}")
    os.remove('temp_model.p')

Epoch 1: 100%|██████████| 6000/6000 [00:11<00:00, 541.63it/s, loss=0.223]


In [13]:
print(f'Model size before quantization')
ModelSize(model)

Model size before quantization
Model Size(KB): 361.401


In [11]:
MODEL_FILENAME = 'dnn_model.pt'

if Path(MODEL_FILENAME).exists():
    model.load_state_dict(torch.load(MODEL_FILENAME))
    print('Loaded model from disk')
else:
    train(train_loader, model, epochs=1)
    # Save the model to disk
    torch.save(model.state_dict(), MODEL_FILENAME)

Loaded model from disk


In [15]:
# inference testing
def test(model: nn.Module, total_iter: int = None):
    correct = 0
    total = 0
    iter = 0

    model.eval()

    with torch.no_grad():
        for data in tqdm(test_loader, desc="Inference Testing"):
            X, y = data
            X = X.to(device)
            y = y.to(device)
            output = model(X.view(-1, 28*28))
            for idx, i in enumerate(output):
                if torch.argmax(i) == y[idx]:
                    correct += 1
                total += 1
            iter += 1

            if total_iter is not None and iter >= total_iter:
                break
    print(f"Model accuracy : {round(correct / total, 3)}")
            
print(f"Model accuracy before quantization")
test(model=model)

Model accuracy before quantization


Inference Testing: 100%|██████████| 6000/6000 [00:05<00:00, 1170.68it/s]

Model accuracy : 0.966





In [17]:
print(f"Model parameter matrix and its data type before quantization")
print(model.linear1.weight)
print(model.linear1.weight.dtype)

Model parameter matrix and its data type before quantization
Parameter containing:
tensor([[ 0.0056,  0.0250, -0.0236,  ...,  0.0278,  0.0096,  0.0079],
        [-0.0159, -0.0111, -0.0066,  ..., -0.0164, -0.0021, -0.0261],
        [ 0.0232,  0.0583,  0.0101,  ...,  0.0230,  0.0445,  0.0514],
        ...,
        [ 0.0584,  0.0621,  0.0275,  ...,  0.0221,  0.0414,  0.0044],
        [ 0.0020,  0.0101,  0.0421,  ...,  0.0224,  0.0225,  0.0220],
        [ 0.0082,  0.0029, -0.0113,  ...,  0.0251, -0.0242, -0.0040]],
       requires_grad=True)
torch.float32


## Post Training Quantization


In [19]:
# Define our Deep Neural Network
class QuantDeepNeuralNetwork(nn.Module):
    def __init__(self, hide_size1=100, hide_size2=100):
        super(QuantDeepNeuralNetwork, self).__init__()
        self.quant = torch.quantization.QuantStub()
        self.linear1 = nn.Linear(28*28, hide_size1) 
        self.linear2 = nn.Linear(hide_size1, hide_size2) 
        self.linear3 = nn.Linear(hide_size2, 10)
        self.relu = nn.ReLU()
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, img):
        x = img.view(-1, 28*28)
        x = self.quant(x)
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        x = self.dequant(x)
        return x
    
model = QuantDeepNeuralNetwork().to(device)

In [26]:
# quantized model architecture
quant_model =QuantDeepNeuralNetwork().to(device)
# copy the unquantized model weights to the quantized model architecture
quant_model.load_state_dict(model.state_dict())
quant_model.eval()



QuantDeepNeuralNetwork(
  (quant): QuantStub()
  (linear1): Linear(in_features=784, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=100, bias=True)
  (linear3): Linear(in_features=100, out_features=10, bias=True)
  (relu): ReLU()
  (dequant): DeQuantStub()
)

In [27]:
quant_model.qconfig = torch.ao.quantization.default_qconfig
quant_model = torch.ao.quantization.prepare(quant_model) # Insert observers
quant_model

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  quant_model = torch.ao.quantization.prepare(quant_model) # Insert observers


QuantDeepNeuralNetwork(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear1): Linear(
    in_features=784, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear2): Linear(
    in_features=100, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear3): Linear(
    in_features=100, out_features=10, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()
)

In [28]:
# test the quantized model on testing dataset
test(quant_model)

Inference Testing: 100%|██████████| 6000/6000 [00:04<00:00, 1275.99it/s]

Model accuracy : 0.098





In [29]:
print(f"Statistics of Layers of DNNs")
quant_model

Statistics of Layers of DNNs


QuantDeepNeuralNetwork(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=-0.4242129623889923, max_val=2.821486711502075)
  )
  (linear1): Linear(
    in_features=784, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=-2.877210855484009, max_val=3.229513168334961)
  )
  (linear2): Linear(
    in_features=100, out_features=100, bias=True
    (activation_post_process): MinMaxObserver(min_val=-1.4892805814743042, max_val=1.450798511505127)
  )
  (linear3): Linear(
    in_features=100, out_features=10, bias=True
    (activation_post_process): MinMaxObserver(min_val=-0.4211212396621704, max_val=0.6218735575675964)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()
)

In [30]:
# quantized model using that statistics we collected 
quant_model = torch.ao.quantization.convert(quant_model)
print(f"Statistics of Layers of DNN")
quant_model

Statistics of Layers of DNN


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  quant_model = torch.ao.quantization.convert(quant_model)


QuantDeepNeuralNetwork(
  (quant): Quantize(scale=tensor([0.0256]), zero_point=tensor([17]), dtype=torch.quint8)
  (linear1): QuantizedLinear(in_features=784, out_features=100, scale=0.04808443784713745, zero_point=60, qscheme=torch.per_tensor_affine)
  (linear2): QuantizedLinear(in_features=100, out_features=100, scale=0.023150229826569557, zero_point=64, qscheme=torch.per_tensor_affine)
  (linear3): QuantizedLinear(in_features=100, out_features=10, scale=0.008212557062506676, zero_point=51, qscheme=torch.per_tensor_affine)
  (relu): ReLU()
  (dequant): DeQuantize()
)

In [31]:
# weights of quantized model
print(f"Weights in between layes of quantized model")
print(torch.int_repr(quant_model.linear1.weight()))

Weights in between layes of quantized model
tensor([[ -78,  -45,   72,  ...,  -79,  -20,  -34],
        [  23,  -23,  -64,  ...,   73,  -27,  117],
        [ 118, -122,  -95,  ...,   94,   29,  111],
        ...,
        [  -4,  -73,   84,  ...,  -17, -110,  -55],
        [  71,  -47,  -68,  ...,  -19,  -11,   47],
        [  85,   91,  -85,  ...,  121,   11,   -4]], dtype=torch.int8)


In [32]:
# lets dequantized the quantized model then compare it's layer1 weights
print('Original weights: ')
print(model.linear1.weight)
print('')
print(f'Dequantized weights: ')
print(torch.dequantize(quant_model.linear1.weight()))
print('')

Original weights: 
Parameter containing:
tensor([[-0.0218, -0.0126,  0.0202,  ..., -0.0222, -0.0057, -0.0095],
        [ 0.0065, -0.0066, -0.0178,  ...,  0.0203, -0.0076,  0.0329],
        [ 0.0332, -0.0341, -0.0266,  ...,  0.0263,  0.0081,  0.0312],
        ...,
        [-0.0011, -0.0205,  0.0235,  ..., -0.0048, -0.0308, -0.0155],
        [ 0.0200, -0.0131, -0.0190,  ..., -0.0054, -0.0030,  0.0132],
        [ 0.0237,  0.0254, -0.0239,  ...,  0.0339,  0.0030, -0.0011]],
       requires_grad=True)

Dequantized weights: 
tensor([[-0.0218, -0.0126,  0.0202,  ..., -0.0221, -0.0056, -0.0095],
        [ 0.0064, -0.0064, -0.0179,  ...,  0.0204, -0.0076,  0.0328],
        [ 0.0331, -0.0342, -0.0266,  ...,  0.0263,  0.0081,  0.0311],
        ...,
        [-0.0011, -0.0204,  0.0235,  ..., -0.0048, -0.0308, -0.0154],
        [ 0.0199, -0.0132, -0.0190,  ..., -0.0053, -0.0031,  0.0132],
        [ 0.0238,  0.0255, -0.0238,  ...,  0.0339,  0.0031, -0.0011]])



In [34]:
print(f'Model size after quantization')
ModelSize(quant_model)

Model size after quantization
Model Size(KB): 95.797


In [35]:
print(f"Testing model after quantization")
test(quant_model)

Testing model after quantization


Inference Testing: 100%|██████████| 6000/6000 [00:06<00:00, 991.46it/s] 

Model accuracy : 0.097



