## Quantization

What can you quantize?
- weights (parameters of the neural network)
- activations (intermediate representations or the values that propagate through the network)
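- Note: PTQ (post-training quantization) is a *when*, not a *what* — it applies quantization to weights and/or activations after training has finished.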

In [1]:
# formula to de-quantize
# r = s * (q - z)

# formula to quantize (the result is then clamped to the integer dtype's range)
# q = int(round(r / s + z))

# s: scale, z: zero point
# q: quantized tensor, r: original (unquantized) tensor

In [2]:
import torch
# Seed PyTorch's random number generator so the `torch.randn` test tensor
# created later in the notebook is the same on every run.
torch.manual_seed(-1)

<torch._C.Generator at 0x10977f970>

In [3]:
def linear_q_with_scale_zero_point(tensor, scale, zeropoint, dtype=torch.int8):
    """Linearly quantize `tensor` with a given scale and zero point.

    Implements q = clamp(round(r / s + z), q_min, q_max), where q_min and
    q_max are the smallest and largest values representable by `dtype`.

    Args:
        tensor: Tensor holding the values to quantize (r).
        scale: Quantization step size (s); `tensor` is divided by it.
        zeropoint: Offset (z) added after scaling.
        dtype: Integer torch dtype of the result; must be accepted by
            `torch.iinfo`. Defaults to `torch.int8`.

    Returns:
        A tensor of `dtype` with the same shape as `tensor`. Values that
        fall outside the dtype's range saturate at its limits.
    """
    limits = torch.iinfo(dtype)

    # Scale and shift into the integer domain, then snap to whole numbers.
    levels = torch.round(tensor / scale + zeropoint)

    # Saturate before casting so out-of-range values do not wrap around.
    return levels.clamp(min=limits.min, max=limits.max).to(dtype)

In [22]:
def linear_dq(quantized_tensor, scale, zero_point):
    """Map quantized values back to floats via r = scale * (q - zero_point).

    Args:
        quantized_tensor: Tensor of quantized values (q); it is converted
            with `.float()` before the zero point is subtracted.
        scale: Scale (s) used when quantizing.
        zero_point: Zero point (z) used when quantizing.

    Returns:
        The de-quantized tensor.
    """
    # Subtract in floating point, after the `.float()` conversion.
    shifted = quantized_tensor.float() - zero_point
    return scale * shifted

In [23]:
# 3x3 tensor of random values used as the input for the quantization round trip.
test_tensor = torch.randn((3, 3))

In [24]:
# Inspect the randomly generated input tensor.
test_tensor

tensor([[-1.2638,  1.2812,  2.2326],
        [ 0.3689, -1.3515, -1.6983],
        [ 2.0415,  0.9185,  0.8641]])

In [25]:
# Arbitrary (not calibrated) scale and zero-point values.
# With int8 codes in [-128, 127], these values can only represent
# scale * (q - zero_point), i.e. the range [-1.19, 1.36]; inputs outside it saturate.
scale = 0.01 # datatype should be the same as the input tensor
zero_point = -9 # datatype should be the same as the quantized tensor

In [26]:
# Quantize with the function's default dtype (torch.int8) using the scale and zero point chosen above.
quantized_tensor = linear_q_with_scale_zero_point(test_tensor, scale, zero_point)

In [27]:
# Inspect the int8 codes; an entry at -128 or 127 sits at the int8 limit and may have been clamped.
quantized_tensor

tensor([[-128,  119,  127],
        [  28, -128, -128],
        [ 127,   83,   77]], dtype=torch.int8)

In [28]:
# Map the int8 codes back to floats: r = scale * (q - zero_point).
# NOTE(review): `dequanized_tensor` is a typo of `dequantized_tensor`; the name is
# kept as-is here because other cells reference it.
dequanized_tensor = linear_dq(quantized_tensor, scale, zero_point)

In [29]:
# Inspect the reconstruction; codes at the int8 limits come back as
# scale * (-128 - zero_point) or scale * (127 - zero_point), not as the original values.
dequanized_tensor

tensor([[-1.1900,  1.2800,  1.3600],
        [ 0.3700, -1.1900, -1.1900],
        [ 1.3600,  0.9200,  0.8600]])

In [30]:
# Re-display the original tensor to compare against the dequantized values above.
test_tensor

tensor([[-1.2638,  1.2812,  2.2326],
        [ 0.3689, -1.3515, -1.6983],
        [ 2.0415,  0.9185,  0.8641]])