## Accelerating Inference

### 1. Fusing Convolution/Linear & BatchNorm

In [1]:
import torch
from torch.nn import functional as F
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build a Linear -> BatchNorm1d pair and randomise every affine parameter so the
# fusion check further down is exercised with non-trivial scales and shifts.
linear = nn.Linear(3, 4)
bn1d = nn.BatchNorm1d(4)
for module in (linear, bn1d):
    nn.init.normal_(module.weight)
    nn.init.normal_(module.bias)

# Feed ten mini-batches of 20 samples through the pair (still in training mode)
# so that the BatchNorm running statistics are updated.
dataset = torch.randn(200, 3)
for i, x in enumerate(dataset.split(20)):
    bn1d(linear(x))

linear.weight, linear.bias, bn1d.weight, bn1d.bias, bn1d.running_mean, bn1d.running_var

(Parameter containing:
 tensor([[ 2.5193, -0.2550, -0.3699],
         [-0.0582,  0.7176, -0.4852],
         [ 0.6087, -0.5791,  1.7046],
         [ 1.4373,  0.2971,  0.6564]], requires_grad=True),
 Parameter containing:
 tensor([-1.5657,  0.4022,  1.5948, -1.5223], requires_grad=True),
 Parameter containing:
 tensor([ 0.8613,  1.7043,  1.3866, -0.1287], requires_grad=True),
 Parameter containing:
 tensor([-0.0949,  0.0348,  0.4146, -1.8652], requires_grad=True),
 tensor([-0.9676,  0.1816,  1.1661, -0.9641]),
 tensor([4.9728, 0.8033, 2.8209, 2.1920]))

In [3]:
# Switch both modules to eval mode so BatchNorm normalises with its running
# statistics; the fused layer built next is only equivalent in this mode.
linear.eval(),bn1d.eval()
x = torch.randn(2, 3)
# Reference output of the unfused Linear -> BatchNorm1d pipeline.
bn1d(linear(x))

tensor([[-1.1633,  3.8841, -2.0963, -1.6851],
        [-0.8841,  2.8700, -1.7901, -1.6802]],
       grad_fn=<NativeBatchNormBackward0>)

In [4]:
# Fold the BatchNorm affine transform into the linear layer:
#   W' = gamma * W / sqrt(running_var + eps)        (scaled per output feature)
#   b' = gamma * (b - running_mean) / sqrt(running_var + eps) + beta
# linear.weight is (out_features, in_features); transposing puts the output
# feature axis last so the per-feature vectors broadcast, and the trailing .T
# restores the original layout.
weight = (bn1d.weight*linear.weight.T/(torch.sqrt(bn1d.running_var+bn1d.eps))).T
bias = bn1d.weight*(linear.bias-bn1d.running_mean)/(torch.sqrt(bn1d.running_var+bn1d.eps)) + bn1d.bias
# A fresh layer of the same shape receives the folded parameters.
fused_linear = nn.Linear(3, 4)
fused_linear.weight.data.copy_(weight),fused_linear.bias.data.copy_(bias)
fused_linear.eval()
# Should reproduce the output of bn1d(linear(x)) from the previous cell.
fused_linear(x)

tensor([[-1.1633,  3.8841, -2.0963, -1.6851],
        [-0.8841,  2.8700, -1.7901, -1.6802]], grad_fn=<AddmmBackward0>)

> `torch.allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False)`

$|\text{input} - \text{other}| \le \text{atol} + \text{rtol} \times |\text{other}|$

In [5]:
# atol=0 makes this a purely relative comparison: |a - b| <= rtol * |b|.
torch.allclose(bn1d(linear(x)), fused_linear(x),atol=0,rtol=1e-5)

True

In [6]:
# Build a Conv2d -> BatchNorm2d pair and randomise every affine parameter so the
# fusion check further down is exercised with non-trivial scales and shifts.
conv = nn.Conv2d(3, 4, kernel_size=3, stride=1, padding=1)
bn2d = nn.BatchNorm2d(4)
for module in (conv, bn2d):
    nn.init.normal_(module.weight)
    nn.init.normal_(module.bias)

# Feed ten mini-batches of 20 images through the pair (still in training mode)
# so that the BatchNorm running statistics are updated.
dataset = torch.randn(200, 3, 10, 10)
for i, x in enumerate(dataset.split(20)):
    bn2d(conv(x))

conv.weight, conv.bias, bn2d.weight, bn2d.bias, bn2d.running_mean, bn2d.running_var

(Parameter containing:
 tensor([[[[-0.3755, -1.9969, -1.7692],
           [-0.6517,  0.6055, -0.6112],
           [ 1.2118, -2.6311, -0.8324]],
 
          [[ 0.2619,  1.2034, -1.6152],
           [-1.3033,  0.4315,  1.4261],
           [ 0.6579, -1.7968, -1.5334]],
 
          [[-1.7177,  0.9733,  0.3640],
           [-0.1917, -1.1114,  0.4718],
           [-0.7481, -1.1785, -0.3109]]],
 
 
         [[[ 0.3502,  1.4771,  0.7519],
           [ 0.7774, -0.7319, -1.2932],
           [ 0.2290,  1.0232, -1.4021]],
 
          [[-1.3428, -0.9688, -0.2844],
           [ 0.1926, -0.4981,  0.0641],
           [ 1.8073, -0.2913, -0.8414]],
 
          [[ 0.1951, -0.2776, -0.6255],
           [-0.2391,  1.3289, -0.5593],
           [-1.7043, -1.6380, -1.8700]]],
 
 
         [[[-1.4260,  1.7896, -0.5829],
           [ 1.1172, -0.8984,  2.4433],
           [ 0.7376,  0.9756, -0.4699]],
 
          [[-0.4453, -0.9966, -0.4681],
           [-1.6308, -0.7064,  0.1754],
           [ 1.6604, -0.4524, 

In [7]:
def fuse_conv_bn(conv_weight, conv_bias, bn_weight, bn_bias, bn_running_mean, bn_running_var, bn_eps):
    """Fold an eval-mode batch norm into the convolution that precedes it.

    With ``std = sqrt(running_var + eps)`` the fused parameters are::

        weight' = bn_weight * conv_weight / std      (per output channel)
        bias'   = bn_weight * (conv_bias - running_mean) / std + bn_bias

    Args:
        conv_weight: convolution weight with the output channel on axis 0;
            any number of trailing axes is accepted (Conv1d/2d/3d layouts).
        conv_bias: convolution bias of shape ``(out_channels,)`` or ``None``
            for a bias-free convolution (treated as zeros).
        bn_weight: batch-norm scale of shape ``(out_channels,)`` or ``None``
            for a non-affine batch norm (treated as ones).
        bn_bias: batch-norm shift of shape ``(out_channels,)`` or ``None``
            for a non-affine batch norm (treated as zeros).
        bn_running_mean: running mean, shape ``(out_channels,)``.
        bn_running_var: running variance, shape ``(out_channels,)``.
        bn_eps: epsilon added to the variance for numerical stability.

    Returns:
        ``(weight, bias)`` to load into a convolution that replaces the
        conv + batch-norm pair. The bias is always a tensor, even when
        ``conv_bias`` was ``None``.
    """
    # Substitute neutral elements for the optional tensors so the formulas
    # below need no special cases.
    if conv_bias is None:
        conv_bias = torch.zeros_like(bn_running_mean)
    if bn_weight is None:
        bn_weight = torch.ones_like(bn_running_mean)
    if bn_bias is None:
        bn_bias = torch.zeros_like(bn_running_mean)

    std = torch.sqrt(bn_running_var + bn_eps)
    # (out_channels, 1, ..., 1): broadcasts over every non-output axis of the
    # weight, whatever the dimensionality of the convolution.
    shape = (-1,) + (1,) * (conv_weight.dim() - 1)
    weight = bn_weight.reshape(shape) * conv_weight / std.reshape(shape)
    bias = bn_weight * (conv_bias - bn_running_mean) / std + bn_bias
    return weight, bias


In [8]:
# Run the check in float64 so the comparison in the next cell can use a much
# tighter relative tolerance than the float32 linear example.
conv = conv.double()
bn2d = bn2d.double()
x = torch.randn(2, 3, 10, 10).double()
# Fresh convolution with the same constructor arguments as `conv`; it receives
# the folded parameters below.
fused_conv = nn.Conv2d(3, 4, 3, 1, 1).double()
conv.eval(),bn2d.eval(),fused_conv.eval()
weight, bias = fuse_conv_bn(conv.weight, conv.bias, bn2d.weight,
                            bn2d.bias, bn2d.running_mean, bn2d.running_var, bn2d.eps)
fused_conv.weight.data.copy_(weight), fused_conv.bias.data.copy_(bias)
# A trailing None keeps the cell from displaying anything.
None

In [9]:
# Purely relative check (atol=0); the tight rtol relies on the float64
# conversion done in the previous cell.
torch.allclose(bn2d(conv(x)), fused_conv(x),atol=0,rtol=1e-12)

True

#### `torch.nn.utils.fusion`

In [10]:
from torch.nn.utils import fusion
# List the public names of the built-in fusion helpers.
[k for k in dir(fusion) if not k.startswith("_")]

['copy',
 'fuse_conv_bn_eval',
 'fuse_conv_bn_weights',
 'fuse_linear_bn_eval',
 'fuse_linear_bn_weights',
 'torch']

In [11]:
# Same equivalence check as the manual fold, using the library helper, which
# returns a fused module that is called directly on the input.
x1d = torch.randn(2,3)
torch.allclose(bn1d(linear(x1d)), fusion.fuse_linear_bn_eval(linear,bn1d)(x1d),atol=0,rtol=1e-5)

True

In [12]:
# conv and bn2d were converted to float64 earlier, so the input must be
# float64 as well.
x2d = torch.randn(2,3,2,2).double()
torch.allclose(bn2d(conv(x2d)), fusion.fuse_conv_bn_eval(conv,bn2d)(x2d),atol=0,rtol=1e-12)

True

#### `torch.FX`

FX is a toolkit for developers to use to transform `nn.Module` instances. FX consists of three main components: a **symbolic tracer**, an **intermediate representation**, and **Python code generation**.

In [13]:
class WrappedBatchNorm(nn.Module):
    """BatchNorm2d hidden one level down inside a custom module.

    Used by the FX example to show a batch norm that is reached through a
    user-defined wrapper rather than being a direct child of the model.
    """
    def __init__(self):
        super().__init__()
        # Single-channel batch norm, stored under the attribute name `mod`.
        self.mod = nn.BatchNorm2d(1)
    def forward(self, x):
        """Apply the wrapped batch norm to ``x``."""
        return self.mod(x)

class M(nn.Module):
    """Toy single-channel network for the FX fusion demo.

    Every BatchNorm2d directly follows a 1x1 convolution, but each one is
    reached differently: as a sibling attribute (``bn1``), as the first entry
    of a ``Sequential`` (``nested.0``) and through a custom wrapper module
    (``wrapped.mod``).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=1)
        self.bn1 = nn.BatchNorm2d(num_features=1)
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=1)
        # Batch norm first, convolution second: the norm pairs with conv2 above.
        nested_bn = nn.BatchNorm2d(num_features=1)
        nested_conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=1)
        self.nested = nn.Sequential(nested_bn, nested_conv)
        self.wrapped = WrappedBatchNorm()

    def forward(self, x):
        """Run ``x`` through the stages in registration order."""
        for stage in (self.conv1, self.bn1, self.conv2, self.nested, self.wrapped):
            x = stage(x)
        return x

model = M()

# Eval mode: the fused/unfused comparison below is made on inference behaviour.
model.eval()

M(
  (conv1): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (bn1): BatchNorm2d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (nested): Sequential(
    (0): BatchNorm2d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  )
  (wrapped): WrappedBatchNorm(
    (mod): BatchNorm2d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [14]:
# Symbolically trace the model, then show both views of the result: the graph
# IR and the Python source generated from it.
traced_model = torch.fx.symbolic_trace(model)
print(traced_model.graph)
print(traced_model.code)

graph():
    %x : [#users=1] = placeholder[target=x]
    %conv1 : [#users=1] = call_module[target=conv1](args = (%x,), kwargs = {})
    %bn1 : [#users=1] = call_module[target=bn1](args = (%conv1,), kwargs = {})
    %conv2 : [#users=1] = call_module[target=conv2](args = (%bn1,), kwargs = {})
    %nested_0 : [#users=1] = call_module[target=nested.0](args = (%conv2,), kwargs = {})
    %nested_1 : [#users=1] = call_module[target=nested.1](args = (%nested_0,), kwargs = {})
    %wrapped_mod : [#users=1] = call_module[target=wrapped.mod](args = (%nested_1,), kwargs = {})
    return wrapped_mod



def forward(self, x):
    conv1 = self.conv1(x);  x = None
    bn1 = self.bn1(conv1);  conv1 = None
    conv2 = self.conv2(bn1);  bn1 = None
    nested_0 = getattr(self.nested, "0")(conv2);  conv2 = None
    nested_1 = getattr(self.nested, "1")(nested_0);  nested_0 = None
    wrapped_mod = self.wrapped.mod(nested_1);  nested_1 = None
    return wrapped_mod
    


In [15]:
import torch.fx.experimental.optimization
# fuse() returns a GraphModule; in the printed result below no BatchNorm2d
# module remains.
fused_model = torch.fx.experimental.optimization.fuse(model)
fused_model

GraphModule(
  (conv1): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (conv2): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (nested): Module(
    (1): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  )
)

In [16]:
# Compare fused and original outputs with allclose's default tolerances.
x = torch.randn(1, 1, 1, 1)
torch.allclose(fused_model(x), model(x))

True

### 2. `torch.JIT`

TorchScript is a way to create serializable and optimizable models from PyTorch code. Any TorchScript program can be saved from a Python process and loaded in a process where there is no Python dependency.

PyTorch uses dynamic computational graphs.
- more flexible.
- debug friendly.
- generally slower.


![pic](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)

#### Tracing & Scripting

In [17]:
class MLP(nn.Module):
    """Fully connected network with the same activation after every layer.

    Args:
        in_features: size of each input sample.
        hidden_features_lits: sizes of the hidden layers, in order. May be
            empty, in which case the network is a single linear layer.
        out_fearures: size of each output sample.
        activation: ``'relu'`` (default) or ``'tanh'``.

    Note:
        The activation is applied after *every* layer, including the last
        one, so with ``'relu'`` the outputs are non-negative.
    """

    def __init__(self, in_features, hidden_features_lits, out_fearures, activation='relu') -> None:
        super().__init__()
        if activation not in ('relu', 'tanh'):
            raise ValueError(f"activation must be 'relu' or 'tanh', got {activation!r}")
        # Consecutive pairs of widths define the layers; this also covers an
        # empty hidden list (a single in_features -> out_fearures layer).
        widths = [in_features, *hidden_features_lits, out_fearures]
        self.net = nn.ModuleList(
            [nn.Linear(fan_in, fan_out)
             for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )
        self.activation = activation

    def forward(self, x):
        """Apply each linear layer followed by the configured activation."""
        # Any value other than 'relu' selects tanh, so reassigning
        # `self.activation` after construction keeps working as before.
        f = F.relu if self.activation == 'relu' else F.tanh
        for layer in self.net:
            x = f(layer(x))
        return x

In [18]:
mlp_model = MLP(3, [4, 5, 6], 7)
# Example input for tracing: a batch of 2 samples with 3 features each.
x = torch.arange(6).reshape(2, 3).float()
traced_mlp_model = torch.jit.trace(mlp_model, x)
# In the generated code below, the Python loop over layers and the activation
# string check no longer appear; only the recorded tensor operations do.
print(traced_mlp_model.code)

def forward(self,
    x: Tensor) -> Tensor:
  net = self.net
  _3 = getattr(net, "3")
  net0 = self.net
  _2 = getattr(net0, "2")
  net1 = self.net
  _1 = getattr(net1, "1")
  net2 = self.net
  _0 = getattr(net2, "0")
  input = torch.relu((_0).forward(x, ))
  input0 = torch.relu((_1).forward(input, ))
  input1 = torch.relu((_2).forward(input0, ))
  return torch.relu((_3).forward(input1, ))



In [19]:
class DecisionGate(nn.Module):
    """Pass the input through when its elements sum to a positive value,
    otherwise return its negation.

    The branch depends on tensor data, which is what makes this module a
    useful probe for the difference between tracing and scripting.
    """

    def forward(self, x):
        total = x.sum()
        if total > 0:
            return x
        return -x

In [20]:
decision_gate = DecisionGate()
x = torch.arange(6).reshape(2,3).float()
# Trace once with an input whose sum is positive and once with its negation:
# each trace keeps only the branch that its example input took.
traced_gate = torch.jit.trace(decision_gate, x)
print(traced_gate.code)
traced_gate_negative_x = torch.jit.trace(decision_gate, -x)
print(traced_gate_negative_x.code)

def forward(self,
    x: Tensor) -> Tensor:
  return x

def forward(self,
    x: Tensor) -> Tensor:
  return torch.neg(x)



  if x.sum()>0:


In [21]:
# Eager mode takes the else-branch for -x, while traced_gate (recorded with a
# positive-sum input) simply returns its argument, so the two results differ.
print(decision_gate(-x))
print(traced_gate(-x))

tensor([[0., 1., 2.],
        [3., 4., 5.]])
tensor([[-0., -1., -2.],
        [-3., -4., -5.]])


In [22]:
# Scripting keeps the if/else in the generated code, so both inputs are
# handled like in eager mode.
scripted_gate = torch.jit.script(decision_gate)
print(scripted_gate.code)
scripted_gate(x),scripted_gate(-x)

def forward(self,
    x: Tensor) -> Tensor:
  if bool(torch.gt(torch.sum(x), 0)):
    _0 = x
  else:
    _0 = torch.neg(x)
  return _0



(tensor([[0., 1., 2.],
         [3., 4., 5.]]),
 tensor([[0., 1., 2.],
         [3., 4., 5.]]))

#### Freezing

Freezing a `ScriptModule` will clone it and attempt to inline the cloned module’s submodules, parameters, and attributes as constants in the TorchScript IR Graph. By default, forward will be preserved, as well as attributes & methods specified in preserved_attrs. Additionally, any attribute that is modified within a preserved method will be preserved.

Freezing currently only accepts ScriptModules that are in eval mode.

Freezing applies generic optimization that will speed up your model regardless of machine. To further optimize using server-specific settings, run `optimize_for_inference` after freezing.

In [23]:
# Four Linear layers, each with a weight and a bias.
len(list(traced_mlp_model.named_parameters()))

8

In [24]:
# freeze() only accepts modules in eval mode.
traced_mlp_model.eval()
freezed_mlp_model = torch.jit.freeze(traced_mlp_model)
# The frozen clone exposes no parameters any more.
len(list(freezed_mlp_model.named_parameters()))

0

In [25]:
# In the frozen code the former weights and biases show up as CONSTANTS.c*.
print(freezed_mlp_model.code)

def forward(self,
    x: Tensor) -> Tensor:
  input = torch.linear(x, CONSTANTS.c0, CONSTANTS.c1)
  input0 = torch.relu(input)
  input1 = torch.linear(input0, CONSTANTS.c2, CONSTANTS.c3)
  input2 = torch.relu(input1)
  input3 = torch.linear(input2, CONSTANTS.c4, CONSTANTS.c5)
  input4 = torch.relu(input3)
  input5 = torch.linear(input4, CONSTANTS.c6, CONSTANTS.c7)
  return torch.relu(input5)



For an end-to-end example of converting a PyTorch model to TorchScript and running it in C++, see the [Loading a PyTorch Model in C++](https://pytorch.org/tutorials/advanced/cpp_export.html) tutorial.

#### Fuse pointwise operations

Pointwise operations (elementwise addition, multiplication, math functions - sin(), cos(), sigmoid() etc.) can be fused into a single kernel to amortize memory access time and kernel launch time.

PyTorch JIT can fuse kernels automatically, although there could be additional fusion opportunities not yet implemented in the compiler, and not all device types are supported equally.

Pointwise operations are memory-bound, for each operation PyTorch launches a separate kernel. Each kernel loads data from the memory, performs computation (this step is usually inexpensive) and stores results back into the memory.

Fused operator launches only one kernel for multiple fused pointwise ops and loads/stores data only once to the memory. This makes JIT very useful for activation functions, optimizers, custom RNN cells etc.

### 3. GPU specific optimizations

#### Enable cuDNN auto-tuner

NVIDIA cuDNN supports many algorithms to compute a **convolution**. Autotuner runs a short benchmark and selects the kernel with the best performance on a given hardware for a given input size.

For convolutional networks (other types currently not supported), enable cuDNN autotuner before launching the training loop by setting:

```python
torch.backends.cudnn.benchmark = True
```

- the auto-tuner decisions may be non-deterministic; different algorithm may be selected for different runs.
- in some rare cases, such as with highly variable input sizes, it’s better to run convolutional networks with autotuner disabled to avoid the overhead associated with algorithm selection for each input size.

#### Avoid unnecessary CPU-GPU synchronization

Avoid unnecessary synchronizations, to let the CPU run ahead of the accelerator as much as possible to make sure that the accelerator work queue contains many operations.

When possible, avoid operations which require synchronizations, for example:

- `print(cuda_tensor)`
- `cuda_tensor.item()`
- memory copies: `tensor.cuda()`, `cuda_tensor.cpu()` and equivalent `tensor.to(device)` calls
- `cuda_tensor.nonzero()`
- python control flow which depends on results of operations performed on CUDA tensors e.g. `if (cuda_tensor != 0).all()`

#### Create tensors directly on the target device

Instead of calling `torch.rand(size).cuda()` to generate a random tensor, produce the output directly on the target device: `torch.rand(size, device=torch.device('cuda'))`.

This is applicable to all functions which create new tensors and accept device argument: `torch.rand()`, `torch.zeros()`, `torch.full()` and similar.

#### Use mixed precision and AMP

Mixed precision leverages Tensor Cores and offers up to 3x overall speedup on Volta and newer GPU architectures. To use Tensor Cores AMP should be enabled and matrix/tensor dimensions should satisfy requirements for calling kernels that use Tensor Cores.

To use Tensor Cores:
- set sizes to multiples of 8 (to map onto dimensions of Tensor Cores)
  - see [Deep Learning Performance Documentation](https://docs.nvidia.com/deeplearning/performance/index.html#optimizing-performance) for more details and guidelines specific to layer type
  - if layer size is derived from other parameters rather than fixed, it can still be explicitly padded e.g. vocabulary size in NLP models
- enable AMP

##### Automatic Mixed Precision

`torch.cuda.amp` provides convenience methods for mixed precision, where some operations use the `torch.float32`(`float`) datatype and other operations use `torch.float16` (`half`). Some ops, like linear layers and convolutions, are much faster in `float16` or `bfloat16`. Other ops, like reductions, often require the dynamic range of `float32`. Mixed precision tries to match each op to its appropriate datatype, which can reduce your network’s runtime and memory footprint.

Ordinarily, “automatic mixed precision training” uses `torch.autocast` and `torch.cuda.amp.GradScaler` together.

```python
def make_model(in_size, out_size, num_layers):
    """Build a stack of ``num_layers`` linear layers separated by ReLUs.

    The first ``num_layers - 1`` layers map ``in_size -> in_size`` and are
    each followed by a ReLU; the final layer maps ``in_size -> out_size``
    and has no activation.
    """
    hidden = [
        module
        for _ in range(num_layers - 1)
        for module in (torch.nn.Linear(in_size, in_size), torch.nn.ReLU())
    ]
    head = torch.nn.Linear(in_size, out_size)
    return torch.nn.Sequential(*hidden, head)


batch_size = 512 # Try, for example, 128, 256, 513.
in_size = 4096
out_size = 4096
num_layers = 3
num_batches = 50
epochs = 3

# Creates data in default precision.
# The same data is used for both default and mixed precision trials below.
# You don't need to manually change inputs' ``dtype`` when enabling mixed precision.
data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)]
targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)]

loss_fn = torch.nn.MSELoss().cuda()
```

##### Default Precision

Without `torch.cuda.amp`, the following simple network executes all ops in default precision (`torch.float32`):

```python


net = make_model(in_size, out_size, num_layers)
opt = torch.optim.SGD(net.parameters(), lr=0.001)

for epoch in range(epochs):
    for input, target in zip(data, targets):
        output = net(input)
        loss = loss_fn(output, target)
        loss.backward()
        opt.step()
        opt.zero_grad() # set_to_none=True here can modestly improve performance
```

##### Adding torch.autocast

Instances of `torch.autocast` serve as context managers that allow regions of your script to run in mixed precision.

```python
with torch.autocast(device_type='cuda', dtype=torch.float16):
    output = net(input)
    # output is float16 because linear layers ``autocast`` to float16.
    assert output.dtype is torch.float16

    loss = loss_fn(output, target)
    # loss is float32 because ``mse_loss`` layers ``autocast`` to float32.
    assert loss.dtype is torch.float32

# Exits ``autocast`` before backward().
# Backward passes under ``autocast`` are not recommended.
# Backward ops run in the same ``dtype`` ``autocast`` chose for corresponding forward ops.
loss.backward()
opt.step()
opt.zero_grad() # set_to_none=True here can modestly improve performance
```

##### Adding GradScaler

Gradient scaling helps prevent gradients with small magnitudes from flushing to zero (“underflowing”) when training with mixed precision.

torch.cuda.amp.GradScaler performs the steps of gradient scaling conveniently.

```py
scaler = torch.cuda.amp.GradScaler()

with torch.autocast(device_type='cuda', dtype=torch.float16):
    output = net(input)
    loss = loss_fn(output, target)

# Scales loss. Calls ``backward()`` on scaled loss to create scaled gradients.
scaler.scale(loss).backward()

# ``scaler.step()`` first unscales the gradients of the optimizer's assigned parameters.
# If these gradients do not contain ``inf``s or ``NaN``s, optimizer.step() is then called,
# otherwise, optimizer.step() is skipped.
scaler.step(opt)

# Updates the scale for next iteration.
scaler.update()

opt.zero_grad() # set_to_none=True here can modestly improve performance
```

##### Inspecting/modifying gradients (e.g., clipping)

All gradients produced by `scaler.scale(loss).backward()` are scaled. If you wish to modify or inspect the parameters’ `.grad` attributes between `backward()` and `scaler.step(optimizer)`, you should unscale them first using `scaler.unscale_(optimizer)`.

```py
with torch.autocast(device_type='cuda', dtype=torch.float16):
    output = net(input)
    loss = loss_fn(output, target)
scaler.scale(loss).backward()

# Unscales the gradients of optimizer's assigned parameters in-place
scaler.unscale_(opt)

# Since the gradients of optimizer's assigned parameters are now unscaled, clips as usual.
# You may use the same value for max_norm here as you would without gradient scaling.
torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.1)

scaler.step(opt)
scaler.update()
opt.zero_grad() # set_to_none=True here can modestly improve performance
```

#### Preallocate memory in case of variable input length

Models for speech recognition or for NLP are often trained on input tensors with variable sequence length. Variable length can be problematic for the PyTorch caching allocator and can lead to reduced performance or to unexpected out-of-memory errors. If a batch with a short sequence length is followed by another batch with a longer sequence length, then PyTorch is forced to release intermediate buffers from the previous iteration and to re-allocate new buffers. This process is time consuming and causes fragmentation in the caching allocator which may result in out-of-memory errors.

A typical solution is to implement preallocation. It consists of the following steps:

1. generate a (usually random) batch of inputs with maximum sequence length (either corresponding to max length in the training dataset or to some predefined threshold)

2. execute a forward and a backward pass with the generated batch, do not execute an optimizer or a learning rate scheduler, this step preallocates buffers of maximum size, which can be reused in subsequent training iterations

3. zero out gradients

4. proceed to regular training

### 4. Quantization

Quantization refers to techniques for doing both computations and memory accesses with lower precision data, usually int8 compared to floating point implementations. This enables performance gains in several important areas:

- 4x reduction in model size;
- 2-4x reduction in memory bandwidth;
- 2-4x faster inference due to savings in memory bandwidth and faster compute with int8 arithmetic (the exact speed up varies depending on the hardware, the runtime, and the model).

Fundamentally quantization means introducing approximations and the resulting networks have slightly less accuracy. These techniques attempt to minimize the gap between the full floating point accuracy and the quantized accuracy.

#### Mapping function

The mapping function is what you might guess - a function that maps values from floating-point to integer space. A commonly used mapping function is a linear transformation given by $Q(r) = round(r/S+Z)$, where $r$ is the input and $S$, $Z$ are **quantization parameters**.

To reconvert to floating point space, the inverse function is given by $\tilde{r} = (Q(r) - Z)\cdot S$.

$\tilde{r} \ne r$, and their difference constitutes the quantization error.

#### Quantization Parameters


The mapping function is parameterized by the **scaling factor** $S$ and **zero-point** $Z$ .

$S$ is simply the ratio of the input range to the output range $S = \cfrac{\beta - \alpha}{\beta_q - \alpha_q}$.

where [$\alpha,\beta$] is the clipping range of the input, i.e. the boundaries of permissible inputs. [$\alpha_q,\beta_q$] is the range in quantized output space that it is mapped to. For 8-bit quantization, the output range $\beta_q - \alpha_q \le 2^8 -1$.

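$Z$ acts as a bias to ensure that a 0 in the input space maps exactly onto a value in the quantized space, namely the zero-point itself: $Q(0) = round(Z)$. It is chosen so that $\alpha$ maps to $\alpha_q$: $Z = -(\cfrac{\alpha}{S} - \alpha_q)$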

#### Calibration

The process of choosing the input clipping range is known as calibration. The simplest technique (also the default in PyTorch) is to record the running minimum and maximum values and assign them to $\alpha$ and $\beta$.

#### Three Types of Quantization

- dynamic quantization (weights quantized with activations read/stored in floating point and quantized for compute)
- static quantization (weights quantized, activations quantized, calibration required post training)
- static quantization aware training (weights quantized, activations quantized, quantization numerics modeled during training)

##### Dynamic Quantization

```py
# original model
# all tensors and computations are in floating point
previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32
                      /
    linear_weight_fp32

# dynamically quantized model
# linear and LSTM weights are in int8
previous_layer_fp32 -- linear_int8_w_fp32_inp -- activation_fp32 -- next_layer_fp32
                      /
    linear_weight_int8

```

In [26]:
from torch.ao import quantization

class lstm_for_demonstration(nn.Module):
    """Minimal wrapper around a single ``nn.LSTM``.

    It exists only so the dynamic quantization example has a model whose
    LSTM submodule can be swapped for a quantized one.
    """

    def __init__(self, in_dim, out_dim, depth):
        super().__init__()
        self.lstm = nn.LSTM(input_size=in_dim, hidden_size=out_dim, num_layers=depth)

    def forward(self, inputs, hidden):
        """Return ``(output, new_hidden)`` exactly as produced by the LSTM."""
        output, new_hidden = self.lstm(inputs, hidden)
        return output, new_hidden

model_dimension=8
sequence_length=20
batch_size=1
lstm_depth=1

# random data for input, laid out as (sequence_length, batch_size, model_dimension)
inputs = torch.randn(sequence_length,batch_size,model_dimension)
# hidden is actually a tuple of the initial hidden state and the initial cell state
hidden = (torch.randn(lstm_depth,batch_size,model_dimension), torch.randn(lstm_depth,batch_size,model_dimension))

lstm_model = lstm_for_demonstration(model_dimension,model_dimension,lstm_depth)
# Dynamically quantize LSTM and Linear submodules to int8; the printed result
# below shows the LSTM replaced by a DynamicQuantizedLSTM.
quantized_lstm = quantization.quantize_dynamic(
    lstm_model, {nn.LSTM, nn.Linear}, dtype=torch.qint8
)
lstm_model,quantized_lstm

(lstm_for_demonstration(
   (lstm): LSTM(8, 8)
 ),
 lstm_for_demonstration(
   (lstm): DynamicQuantizedLSTM(8, 8)
 ))

In [27]:
import os
def print_size_of_model(model):
    """Print the size in bytes of ``model``'s serialized ``state_dict``.

    The state dict is written to ``temp.p`` in the current working directory,
    measured, and the file is removed again. Cleanup also happens when saving
    or measuring fails, so no stray file is left behind.

    Args:
        model: any object exposing ``state_dict()`` whose result
            ``torch.save`` can serialize.
    """
    path = "temp.p"
    # Collect the state before touching the file system so a failure here
    # cannot affect an unrelated file that happens to be called temp.p.
    state = model.state_dict()
    try:
        torch.save(state, path)
        print('Size (B):', os.path.getsize(path))
    finally:
        if os.path.exists(path):
            os.remove(path)

# Float model first, dynamically quantized model second.
print_size_of_model(lstm_model)
print_size_of_model(quantized_lstm)

Size (B): 3661
Size (B): 2573


In [28]:
%timeit lstm_model.forward(inputs, hidden)

1.79 ms ± 166 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [29]:
%timeit quantized_lstm.forward(inputs, hidden)

952 µs ± 67.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [30]:
# run the float model
out1, hidden1 = lstm_model(inputs, hidden)
mag1 = torch.mean(abs(out1)).item()
print('mean absolute value of output tensor values in the FP32 model is {0:.5f} '.format(mag1))

# run the quantized model
out2, hidden2 = quantized_lstm(inputs, hidden)
mag2 = torch.mean(abs(out2)).item()
print('mean absolute value of output tensor values in the INT8 model is {0:.5f}'.format(mag2))

# compare them: mean absolute difference, also expressed as a percentage of
# the FP32 model's mean absolute output
mag3 = torch.mean(abs(out1-out2)).item()
print('mean absolute value of the difference between the output tensors is {0:.5f} or {1:.2f} percent'.format(mag3,mag3/mag1*100))

mean absolute value of output tensor values in the FP32 model is 0.17307 
mean absolute value of output tensor values in the INT8 model is 0.17307
mean absolute value of the difference between the output tensors is 0.00144 or 0.83 percent


##### Post-training Static Quantization


```py
# original model
# all tensors and computations are in floating point
previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32
                    /
    linear_weight_fp32

# statically quantized model
# weights and activations are in int8
previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8
                      /
    linear_weight_int8

```

In [31]:
# define a floating point model where some layers could be statically quantized
class M(torch.nn.Module):
    """Float conv + ReLU model bracketed by quantization stubs.

    ``quant`` and ``dequant`` mark where tensors cross between floating point
    and quantized representation once the model has been converted.
    """

    def __init__(self):
        super().__init__()
        stubs = torch.ao.quantization
        # float -> quantized boundary
        self.quant = stubs.QuantStub()
        self.conv = torch.nn.Conv2d(in_channels=1, out_channels=1, kernel_size=1)
        self.relu = torch.nn.ReLU()
        # quantized -> float boundary
        self.dequant = stubs.DeQuantStub()

    def forward(self, x):
        # Entry point of the quantized region in the converted model.
        quantized = self.quant(x)
        activated = self.relu(self.conv(quantized))
        # Exit point of the quantized region in the converted model.
        return self.dequant(activated)

# create a model instance
model_fp32 = M()

# model must be set to eval mode for static quantization logic to work
model_fp32.eval()

# attach a global qconfig, which contains information about what kind
# of observers to attach. Use 'x86' for server inference and 'qnnpack'
# for mobile inference. Other quantization configurations such as selecting
# symmetric or asymmetric quantization and MinMax or L2Norm calibration techniques
# can be specified here.
# Note: the old 'fbgemm' is still available but 'x86' is the recommended default
# for server inference.
# model_fp32.qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')
model_fp32.qconfig = quantization.get_default_qconfig('x86')

# Fuse the activations to preceding layers, where applicable.
# This needs to be done manually depending on the model architecture.
# Common fusions include `conv + relu` and `conv + batchnorm + relu`
model_fp32_fused = quantization.fuse_modules(model_fp32, [['conv', 'relu']])

# Prepare the model for static quantization. This inserts observers in
# the model that will observe activation tensors during calibration.
model_fp32_prepared = quantization.prepare(model_fp32_fused)

# calibrate the prepared model to determine quantization parameters for activations
# in a real world setting, the calibration would be done with a representative dataset
input_fp32 = torch.randn(4, 1, 4, 4)
model_fp32_prepared(input_fp32)

# Convert the observed model to a quantized model. This does several things:
# quantizes the weights, computes and stores the scale and zero-point to be
# used with each activation tensor, and replaces key operators with quantized
# implementations.
model_int8 = quantization.convert(model_fp32_prepared)

# run the model, relevant calculations will happen in int8
res = model_int8(input_fp32)



##### Quantization Aware Training for Static Quantization

```py
# original model
# all tensors and computations are in floating point
previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32
                      /
    linear_weight_fp32

# model with fake_quants for modeling quantization numerics during training
previous_layer_fp32 -- fq -- linear_fp32 -- activation_fp32 -- fq -- next_layer_fp32
                           /
   linear_weight_fp32 -- fq

# quantized model
# weights and activations are in int8
previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8
                     /
   linear_weight_int8
```

In [32]:
# define a floating point model where some layers could benefit from QAT
class M(torch.nn.Module):
    """Float conv + batch norm + ReLU model bracketed by quantization stubs.

    ``quant`` and ``dequant`` mark where tensors cross between floating point
    and quantized representation once the model has been converted.
    """

    def __init__(self):
        super().__init__()
        stubs = torch.ao.quantization
        # float -> quantized boundary
        self.quant = stubs.QuantStub()
        self.conv = torch.nn.Conv2d(in_channels=1, out_channels=1, kernel_size=1)
        self.bn = torch.nn.BatchNorm2d(num_features=1)
        self.relu = torch.nn.ReLU()
        # quantized -> float boundary
        self.dequant = stubs.DeQuantStub()

    def forward(self, x):
        quantized = self.quant(x)
        activated = self.relu(self.bn(self.conv(quantized)))
        return self.dequant(activated)

# create a model instance
model_fp32 = M()

# model must be set to eval for fusion to work
model_fp32.eval()

# attach a global qconfig, which contains information about what kind
# of observers to attach. Use 'x86' for server inference and 'qnnpack'
# for mobile inference. Other quantization configurations such as selecting
# symmetric or asymmetric quantization and MinMax or L2Norm calibration techniques
# can be specified here.
# Note: the old 'fbgemm' is still available but 'x86' is the recommended default
# for server inference.
# model_fp32.qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')
model_fp32.qconfig = torch.ao.quantization.get_default_qat_qconfig('x86')

# fuse the activations to preceding layers, where applicable
# this needs to be done manually depending on the model architecture
model_fp32_fused = torch.ao.quantization.fuse_modules(model_fp32,
    [['conv', 'bn', 'relu']])

# Prepare the model for QAT. This inserts observers and fake_quants in
# the model needs to be set to train for QAT logic to work
# the model that will observe weight and activation tensors during calibration.
model_fp32_prepared = torch.ao.quantization.prepare_qat(model_fp32_fused.train())

def training_loop(model):
    """Placeholder for the QAT fine-tuning loop; deliberately does nothing.

    Args:
        model: the prepared model that a real loop would train (unused here).

    Returns:
        None.
    """
    return None

# Run the training loop (not shown). training_loop is a no-op placeholder in
# this notebook, so no fine-tuning actually happens here.
training_loop(model_fp32_prepared)

# Convert the observed model to a quantized model. This does several things:
# quantizes the weights, computes and stores the scale and zero-point values
# to be used with each activation tensor, fuses modules where appropriate,
# and replaces key operators with quantized implementations.
# The prepared model is switched to eval mode before conversion.
model_fp32_prepared.eval()
model_int8 = torch.ao.quantization.convert(model_fp32_prepared)

# Run the model; the relevant calculations will happen in int8.
# NOTE(review): input_fp32 is not defined in this cell -- presumably it comes
# from an earlier cell; confirm this still works after Restart & Run All.
res = model_int8(input_fp32)



#### Model Preparation for Eager Mode Static Quantization

Currently, some modifications to the model definition are necessary before applying Eager Mode quantization, because quantization works on a module-by-module basis. Specifically, for all quantization techniques, the user needs to:

1. Convert any operations that require output requantization (and thus have additional parameters) from functionals to module form (for example, using `torch.nn.ReLU` instead of `torch.nn.functional.relu`).

2. Specify which parts of the model need to be quantized, either by assigning `.qconfig` attributes on submodules or by specifying `qconfig_mapping`. For example, setting `model.conv1.qconfig = None` means that the `model.conv1` layer will not be quantized, and setting `model.linear1.qconfig = custom_qconfig` means that the quantization settings for `model.linear1` will use `custom_qconfig` instead of the global qconfig.

For static quantization techniques which quantize activations, the user needs to do the following in addition:

1. Specify where activations are quantized and de-quantized. This is done using `QuantStub` and `DeQuantStub` modules.

2. Use `FloatFunctional` to wrap tensor operations that require special handling for quantization into modules. Examples are operations like add and cat which require special handling to determine output quantization parameters.

3. Fuse modules: combine operations/modules into a single module to obtain higher accuracy and performance. This is done using the `fuse_modules()` API, which takes in lists of modules to be fused. We currently support the following fusions: [Conv, ReLU], [Conv, BatchNorm], [Conv, BatchNorm, ReLU], [Linear, ReLU]

### Torch-TensorRT

![Torch-TensorRT](https://developer-blogs.nvidia.com/wp-content/uploads/2021/12/pytorch-torch-tensorrt.png)

[Accelerating Inference Up to 6x Faster in PyTorch with Torch-TensorRT](https://developer.nvidia.com/blog/accelerating-inference-up-to-6x-faster-in-pytorch-with-torch-tensorrt/)

### Reference & Reading List
- [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
- [Performance Tuning Guide](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html)
- [Computational graphs in PyTorch and TensorFlow](https://towardsdatascience.com/computational-graphs-in-pytorch-and-tensorflow-c25cc40bdcd1)
- [Introduction to Torchscript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html)
- [Loading a PyTorch Model in C++](https://pytorch.org/tutorials/advanced/cpp_export.html)
- [TorchScript for Deployment](https://pytorch.org/tutorials/recipes/torchscript_inference.html)
- [Deep Learning Performance Documentation](https://docs.nvidia.com/deeplearning/performance/index.html#optimizing-performance)
- [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html)
- [Quantization](https://pytorch.org/docs/stable/quantization.html)
- [Introduction to Quantization on PyTorch](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/)
- [Dynamic Quantization](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html)
- [Practical Quantization in PyTorch](https://pytorch.org/blog/quantization-in-practice/)