### What Post-Training Quantization is...

#### Importing the required libraries 

In [1]:
import os
import tempfile
from pathlib import Path

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from tqdm import tqdm

#### Loading the dataset

In [4]:
# Seed PyTorch's global RNG so repeated runs draw the same random numbers;
# binding the returned Generator to `_` keeps it out of the cell output.
_ = torch.manual_seed(seed=433)

In [5]:
# Preprocessing: convert each image to a tensor, then standardise it with
# (mean, std) = (0.1307, 0.3081), the values commonly quoted for MNIST.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# MNIST train / test splits, downloaded into ./data when not already present.
mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Mini-batch loaders (10 samples per batch). Only the training loader shuffles:
# evaluation results do not depend on sample order, so the test loader keeps a
# fixed order for repeatable runs.
train_loader = torch.utils.data.DataLoader(mnist_trainset, batch_size=10, shuffle=True)
test_loader = torch.utils.data.DataLoader(mnist_testset, batch_size=10, shuffle=False)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100%|███████████████████████████████████████████████████████████████████| 9912422/9912422 [00:01<00:00, 5556356.70it/s]


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz


100%|████████████████████████████████████████████████████████████████████████| 28881/28881 [00:00<00:00, 400398.28it/s]


Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


100%|███████████████████████████████████████████████████████████████████| 1648877/1648877 [00:00<00:00, 3320136.78it/s]


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz


100%|█████████████████████████████████████████████████████████████████████████| 4542/4542 [00:00<00:00, 4554274.15it/s]

Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw






#### Simple Neural Network 

In [6]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 784 inputs -> three hidden layers -> 10 logits.

    Args:
        hidden_layer_1: width of the first hidden layer.
        hidden_layer_2: width of the second hidden layer.
        hidden_layer_3: width of the third hidden layer.
    """

    def __init__(self, hidden_layer_1 = 50,hidden_layer_2 = 80, hidden_layer_3 = 30):
        super().__init__()
        n_inputs, n_outputs = 28 * 28, 10
        # Layers are created in forward order (linear1 .. linear4, then relu).
        self.linear1 = nn.Linear(n_inputs, hidden_layer_1)
        self.linear2 = nn.Linear(hidden_layer_1, hidden_layer_2)
        self.linear3 = nn.Linear(hidden_layer_2, hidden_layer_3)
        self.linear4 = nn.Linear(hidden_layer_3, n_outputs)
        self.relu = nn.ReLU()

    def forward(self,img):
        """Flatten `img` to rows of 784 values and return the raw class scores."""
        hidden = img.view(-1, 28 * 28)
        # ReLU follows every hidden layer; the output layer stays linear.
        for hidden_layer in (self.linear1, self.linear2, self.linear3):
            hidden = self.relu(hidden_layer(hidden))
        return self.linear4(hidden)

# Instantiate the float (unquantized) network with default layer widths and
# move it to the selected device.
model = NeuralNetwork().to(device)

#### Model training

In [7]:
def train(train_loader, model, epochs = None, total_iterations_limit = None):
    """Train `model` on `train_loader` with Adam (lr=0.001) and cross-entropy loss.

    Args:
        train_loader: iterable yielding `(x, y)` batches, where `x` holds the
            inputs (reshaped to rows of 28*28 values before the forward pass)
            and `y` the class labels.
        model: the `nn.Module` to optimise; batches are moved to the device of
            its first parameter.
        epochs: number of passes over `train_loader`. When None, training keeps
            cycling through the loader until `total_iterations_limit` is hit.
        total_iterations_limit: optional cap on the total number of optimiser
            steps across all epochs; training returns as soon as it is reached.

    Raises:
        ValueError: if both `epochs` and `total_iterations_limit` are None,
            because training would have no stopping condition.
    """
    if epochs is None and total_iterations_limit is None:
        raise ValueError("Provide `epochs`, `total_iterations_limit`, or both.")

    # optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_function = nn.CrossEntropyLoss() # since this is a classification problem.

    # Send batches to wherever the model lives instead of relying on a global.
    model_device = next(model.parameters()).device

    total_iterations = 0  # Optimiser steps taken across all epochs
    epoch = 0

    while epochs is None or epoch < epochs:
        model.train()

        loss_sum = 0  # Sum of the batch losses, used for the running average
        num_iterations = 0  # Batches processed in this epoch
        data_iterator = tqdm(train_loader, desc=f'Epoch {epoch+1}')

        if total_iterations_limit is not None:
            # Size the bar to what this epoch can actually run: the steps still
            # allowed, capped by the loader length when tqdm could determine it.
            remaining = total_iterations_limit - total_iterations
            known_total = data_iterator.total
            data_iterator.total = remaining if known_total is None else min(known_total, remaining)

        for data in data_iterator:
            num_iterations += 1
            total_iterations += 1
            x, y = data # 'data' is a batch (x, y), where x is the input (image), and y is the label (digit)
            x = x.to(model_device)
            y = y.to(model_device)
            optimizer.zero_grad()
            output = model(x.view(-1, 28*28))
            loss = loss_function(output, y)
            loss_sum += loss.item()
            avg_loss = loss_sum / num_iterations
            data_iterator.set_postfix(loss=avg_loss)
            loss.backward()
            optimizer.step()

            # If a total iteration limit is set, stop training once the limit is reached
            if total_iterations_limit is not None and total_iterations >= total_iterations_limit:
                return

        if num_iterations == 0:
            # An empty loader can never reach the iteration limit; stop rather
            # than loop forever when `epochs` is None.
            return
        epoch += 1

#### Function to print the size of the model 

In [9]:
def print_size_of_model(model):
    """Print the on-disk size of `model.state_dict()` in kilobytes (bytes / 1000).

    The state dict is serialised with `torch.save` into a throwaway directory,
    so nothing is written to the working directory and the scratch file is
    removed even if saving or measuring raises.

    Args:
        model: any object exposing `state_dict()` (typically an `nn.Module`).
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        # Same file name as before, just placed inside the scratch directory.
        checkpoint_path = os.path.join(scratch_dir, "temp_delme.p")
        torch.save(model.state_dict(), checkpoint_path)
        print('Size (KB):', os.path.getsize(checkpoint_path)/1e3)

# Checkpoint used to cache the trained float model between notebook runs.
MODEL_FILENAME = 'simpleNN_ptq.pt'

if Path(MODEL_FILENAME).exists():
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU machine still loads on a CPU-only one.
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load(MODEL_FILENAME, map_location=device))
    print('Loaded model from disk')
else:
    train(train_loader, model, epochs=1)
    # Save the model to disk
    torch.save(model.state_dict(), MODEL_FILENAME)

Epoch 1: 100%|████████████████████████████████████████████████████████| 6000/6000 [00:58<00:00, 101.70it/s, loss=0.135]


#### Time to test our Neural Network

In [10]:
def test(model, total_iterations=None, data_loader=None):
    """Evaluate `model` and print its classification accuracy.

    Args:
        model: the `nn.Module` to evaluate; it is switched to eval mode.
        total_iterations: optional cap on the number of batches to evaluate;
            None evaluates the whole loader.
        data_loader: iterable of `(x, y)` batches. Defaults to the notebook's
            global `test_loader` when None.

    Inputs are moved to the device of the model's first parameter (or first
    buffer), falling back to the CPU when the model exposes neither. This keeps
    the batch on the same device as the model instead of a global device that
    may differ from it.
    """
    loader = test_loader if data_loader is None else data_loader

    # Locate the model: parameters first, then buffers, else assume CPU.
    anchor = next(model.parameters(), None)
    if anchor is None:
        anchor = next(model.buffers(), None)
    model_device = anchor.device if anchor is not None else torch.device('cpu')

    correct, total, iterations = 0, 0, 0

    model.eval()
    with torch.no_grad():
        for x, y in tqdm(loader, desc='Testing'):
            x = x.to(model_device)
            y = y.to(model_device)
            output = model(x.view(-1, 784))
            # Count the whole batch at once instead of looping over samples.
            correct += (output.argmax(dim=1) == y).sum().item()
            total += y.size(0)
            iterations += 1
            if total_iterations is not None and iterations >= total_iterations:
                break

    if total == 0:
        # Empty loader: report it rather than dividing by zero.
        print('Accuracy: n/a (no samples were evaluated)')
        return
    print(f'Accuracy: {round(correct/total, 8)}')

#### Looking at the values of the tensors and the size of the model before quantization!

In [11]:
# Show the first layer's weight matrix and its dtype before quantization.
print(
    'Weights before quantization',
    model.linear1.weight,
    model.linear1.weight.dtype,
    sep='\n',
)

Weights before quantization
Parameter containing:
tensor([[ 2.2669e-02,  2.1956e-02, -2.6446e-03,  ...,  3.9206e-03,
          3.7860e-02,  4.6699e-03],
        [ 9.6753e-05,  3.8221e-02,  1.9658e-02,  ..., -6.3200e-03,
          2.2407e-02, -1.6425e-02],
        [ 5.9448e-02,  4.3217e-03,  1.6101e-02,  ..., -7.5925e-03,
          7.5968e-03,  2.9488e-02],
        ...,
        [-1.2917e-02,  1.4802e-02,  6.9440e-03,  ...,  3.6022e-04,
         -9.9758e-03,  9.7989e-03],
        [-3.5000e-03,  4.5107e-02,  1.5952e-02,  ...,  3.1508e-02,
          2.7726e-02,  3.5913e-02],
        [-3.3904e-02,  1.7777e-02, -3.4840e-03,  ...,  2.9134e-03,
         -1.7864e-03,  1.2128e-02]], requires_grad=True)
torch.float32


In [12]:
# Report the serialized size of the float model as the baseline for comparison.
print('Size of the model before quantization')
print_size_of_model(model)

Size of the model before quantization
Size (KB): 187.364


In [14]:
# Baseline accuracy of the float model over the full test loader
# (None = no cap on the number of batches).
print('Accuracy of the model before quantization: ')
test(model, None)

Accuracy of the model before quantization: 


Testing: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 291.71it/s]

Accuracy: 0.9646





### Time to Quantize ;)

In [16]:
### We make a copy of that same model: 
class QuantizeNeuralNetwork(nn.Module):
    """Quantization-ready copy of the float network.

    Same linear/ReLU stack (784 -> h1 -> h2 -> h3 -> 10), with a QuantStub in
    front of the first layer and a DeQuantStub after the last one. The layer
    attribute names are `linear1` .. `linear4`, so a state dict with those keys
    loads directly.

    Args:
        hidden_layer_1: width of the first hidden layer.
        hidden_layer_2: width of the second hidden layer.
        hidden_layer_3: width of the third hidden layer.
    """

    def __init__(self, hidden_layer_1 = 50,hidden_layer_2 = 80, hidden_layer_3 = 30):
        super().__init__()
        # Stubs come from torch.ao.quantization, the same namespace the
        # prepare/convert cells use.
        self.quant = torch.ao.quantization.QuantStub()
        self.linear1 = nn.Linear(28*28, hidden_layer_1)
        self.linear2 = nn.Linear(hidden_layer_1, hidden_layer_2)
        self.linear3 = nn.Linear(hidden_layer_2, hidden_layer_3)
        self.linear4 = nn.Linear(hidden_layer_3, 10)
        self.relu = nn.ReLU()
        self.dequant = torch.ao.quantization.DeQuantStub()

    def forward(self,img):
        """Flatten `img` to rows of 784 values and return the class scores."""
        x = img.view(-1, 28*28)
        x = self.quant(x)      # entry point of the quantized region
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.relu(self.linear3(x))
        x = self.linear4(x)
        x = self.dequant(x)    # exit point of the quantized region
        return x

# Instantiate the quantization-ready network with default layer widths and
# move it to the selected device.
quant_model = QuantizeNeuralNetwork().to(device)

#### So I am not going to retrain; I just copy the trained weights and run inference with the quantized version...

In [17]:
# Copy the trained float weights into the quantization-ready model instead of retraining.
quant_model.load_state_dict(model.state_dict())
quant_model.eval() ## we are not training, only doing inference

# Attach the default qconfig to the model, then let prepare() add the observer
# modules to it.
quant_model.qconfig = torch.ao.quantization.default_qconfig
quant_model = torch.ao.quantization.prepare(quant_model) # Insert observers
# Display the prepared model so the inserted observers are visible.
quant_model

QuantizeNeuralNetwork(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear1): Linear(
    in_features=784, out_features=50, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear2): Linear(
    in_features=50, out_features=80, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear3): Linear(
    in_features=80, out_features=30, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (linear4): Linear(
    in_features=30, out_features=10, bias=True
    (activation_post_process): MinMaxObserver(min_val=inf, max_val=-inf)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()
)

In [19]:
# Calibration pass: running the prepared model over the data lets each observer
# record the min/max of the activations it sees.
# NOTE(review): test() iterates the test loader, i.e. the same data later used
# to report accuracy; calibrating on held-out training data would avoid that
# overlap -- confirm whether this matters for the write-up.
test(quant_model, None)
print('Check statistics of the various layers')
# Display the model again: the observers now hold the recorded ranges.
quant_model

Testing: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 385.78it/s]

Accuracy: 0.9646
Check statistics of the various layers





QuantizeNeuralNetwork(
  (quant): QuantStub(
    (activation_post_process): MinMaxObserver(min_val=-0.4242129623889923, max_val=2.821486711502075)
  )
  (linear1): Linear(
    in_features=784, out_features=50, bias=True
    (activation_post_process): MinMaxObserver(min_val=-58.53604507446289, max_val=43.8294563293457)
  )
  (linear2): Linear(
    in_features=50, out_features=80, bias=True
    (activation_post_process): MinMaxObserver(min_val=-40.140647888183594, max_val=35.24177551269531)
  )
  (linear3): Linear(
    in_features=80, out_features=30, bias=True
    (activation_post_process): MinMaxObserver(min_val=-36.355369567871094, max_val=43.074493408203125)
  )
  (linear4): Linear(
    in_features=30, out_features=10, bias=True
    (activation_post_process): MinMaxObserver(min_val=-36.072540283203125, max_val=22.88129234313965)
  )
  (relu): ReLU()
  (dequant): DeQuantStub()
)

I think this is beautiful: we can now see the range of values (min and max) that each observer recorded for its layer's activations. These statistics tell us how to quantize each layer.

### Quantization using the statistics gathered from the observer!

In [20]:
# Swap the observed float modules for their quantized counterparts, using the
# ranges gathered during calibration. PyTorch's eager-mode quantized kernels
# are documented as CPU backends, so the model is moved to the CPU first
# (a no-op when it is already there).
quant_model = torch.ao.quantization.convert(quant_model.to('cpu'))
print('Check statistics of the various layers')
# Display the converted model: each layer now reports its scale and zero point.
quant_model

Check statistics of the various layers


QuantizeNeuralNetwork(
  (quant): Quantize(scale=tensor([0.0256]), zero_point=tensor([17]), dtype=torch.quint8)
  (linear1): QuantizedLinear(in_features=784, out_features=50, scale=0.8060275912284851, zero_point=73, qscheme=torch.per_tensor_affine)
  (linear2): QuantizedLinear(in_features=50, out_features=80, scale=0.5935623645782471, zero_point=68, qscheme=torch.per_tensor_affine)
  (linear3): QuantizedLinear(in_features=80, out_features=30, scale=0.625432014465332, zero_point=58, qscheme=torch.per_tensor_affine)
  (linear4): QuantizedLinear(in_features=30, out_features=10, scale=0.464203417301178, zero_point=78, qscheme=torch.per_tensor_affine)
  (relu): ReLU()
  (dequant): DeQuantize()
)

So each layer has its own scale and zero point.

In [21]:
# Print the weights matrix of the model after quantization
print('Weights after quantization')
print(torch.int_repr(quant_model.linear1.weight()))

Weights after quantization
tensor([[ 4,  4,  0,  ...,  1,  6,  1],
        [ 0,  6,  3,  ..., -1,  4, -3],
        [10,  1,  3,  ..., -1,  1,  5],
        ...,
        [-2,  2,  1,  ...,  0, -2,  2],
        [-1,  7,  3,  ...,  5,  5,  6],
        [-6,  3, -1,  ...,  0,  0,  2]], dtype=torch.int8)


In [22]:
# Print the float weights next to the dequantized ones to make the rounding
# error introduced by quantization visible.
print('Original weights: ')
print(model.linear1.weight)
print('')
print('Dequantized weights: ')
print(quant_model.linear1.weight().dequantize())
print('')

Original weights: 
Parameter containing:
tensor([[ 2.2669e-02,  2.1956e-02, -2.6446e-03,  ...,  3.9206e-03,
          3.7860e-02,  4.6699e-03],
        [ 9.6753e-05,  3.8221e-02,  1.9658e-02,  ..., -6.3200e-03,
          2.2407e-02, -1.6425e-02],
        [ 5.9448e-02,  4.3217e-03,  1.6101e-02,  ..., -7.5925e-03,
          7.5968e-03,  2.9488e-02],
        ...,
        [-1.2917e-02,  1.4802e-02,  6.9440e-03,  ...,  3.6022e-04,
         -9.9758e-03,  9.7989e-03],
        [-3.5000e-03,  4.5107e-02,  1.5952e-02,  ...,  3.1508e-02,
          2.7726e-02,  3.5913e-02],
        [-3.3904e-02,  1.7777e-02, -3.4840e-03,  ...,  2.9134e-03,
         -1.7864e-03,  1.2128e-02]], requires_grad=True)

Dequantized weights: 
tensor([[ 0.0245,  0.0245,  0.0000,  ...,  0.0061,  0.0368,  0.0061],
        [ 0.0000,  0.0368,  0.0184,  ..., -0.0061,  0.0245, -0.0184],
        [ 0.0613,  0.0061,  0.0184,  ..., -0.0061,  0.0061,  0.0306],
        ...,
        [-0.0123,  0.0123,  0.0061,  ...,  0.0000, -0.0123,  

#### Let's compare the unquantized and quantized models

In [24]:
# Report the serialized size of the quantized model ...
print('Size of the model after quantization')
print_size_of_model(quant_model)
# ... and its accuracy over the full test loader (None = no batch cap).
print('Testing the model after quantization')
test(quant_model,None)

Size of the model after quantization
Size (KB): 52.77
Testing the model after quantization


Testing: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 403.01it/s]

Accuracy: 0.9626





The model size has gone down from 187.364 KB to 52.77 KB (roughly 3.5× smaller), while accuracy dropped only slightly, from 0.9646 to 0.9626.