# Quantize LSTM

In [1]:
from model import PerformanceRNN
import torch
from torch import nn
import distiller
from distiller.modules import DistillerLSTM as LSTM
from distiller.modules import convert_model_to_distiller_lstm
from tqdm import tqdm
import numpy as np

### Loading the model and converting to our own implementation.

In [2]:
# Fail fast on machines without a GPU: everything below is placed on 'cuda:0'.
assert torch.cuda.is_available()
sess_path = "save/LSTM_model.sess"
device = 'cuda:0'

# The session file is indexed like a dict: 'model_config' holds the constructor
# kwargs for PerformanceRNN and 'model_state' holds the saved state dict.
state = torch.load(sess_path)
model_kwargs = state['model_config']
rnn_model = PerformanceRNN(**model_kwargs).to(device)
rnn_model.load_state_dict(state['model_state'])

In [3]:
# Swap the model's LSTM for Distiller's DistillerLSTM implementation.
# NOTE(review): the repr of `rnn_model` printed further down also shows a
# DistillerLSTM, so the conversion appears to happen in place and `man_model`
# likely refers to the same object as `rnn_model` — confirm.
man_model = convert_model_to_distiller_lstm(rnn_model)

In [4]:
# Inspect the converted recurrent module. The attribute is named `gru`, but the
# repr below shows that it holds a DistillerLSTM.
man_model.gru

DistillerLSTM(512, 512, num_layers=3, dropout=0.30, bidirectional=False)

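Above, we converted the PyTorch LSTM implementation to Distiller's own (`DistillerLSTM`) by calling `convert_model_to_distiller_lstm` on the whole model (a single `nn.LSTM` module can be converted with `LSTM.from_pytorch_impl`):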

### Check that the conversion has succeeded:

In [5]:
# Switch `rnn_model` to evaluation mode; as the last expression of the cell,
# the call also displays the module's repr.
rnn_model.eval()

PerformanceRNN(
  (inithid_fc): Linear(in_features=32, out_features=1536, bias=True)
  (inithid_fc_activation): Tanh()
  (event_embedding): Embedding(240, 240)
  (concat_input_fc): Linear(in_features=265, out_features=512, bias=True)
  (concat_input_fc_activation): LeakyReLU(negative_slope=0.1, inplace)
  (gru): DistillerLSTM(512, 512, num_layers=3, dropout=0.30, bidirectional=False)
  (output_fc): Linear(in_features=1536, out_features=240, bias=True)
  (output_fc_activation): Softmax()
)

In [6]:
# Switch `man_model` to evaluation mode; as the last expression of the cell,
# the call also displays the module's repr for comparison with `rnn_model`.
man_model.eval()

PerformanceRNN(
  (inithid_fc): Linear(in_features=32, out_features=1536, bias=True)
  (inithid_fc_activation): Tanh()
  (event_embedding): Embedding(240, 240)
  (concat_input_fc): Linear(in_features=265, out_features=512, bias=True)
  (concat_input_fc_activation): LeakyReLU(negative_slope=0.1, inplace)
  (gru): DistillerLSTM(512, 512, num_layers=3, dropout=0.30, bidirectional=False)
  (output_fc): Linear(in_features=1536, out_features=240, bias=True)
  (output_fc_activation): Softmax()
)

### Defining the evaluation:

# Quantizing the model:

## Collect activation statistics:

The model uses activation statistics to determine how big the quantization range is. The bigger the range, the larger the round-off error after quantization, which leads to a drop in accuracy.  
Our goal is to minimize the range such that it still contains the vast majority of our data.  
After that, we divide the range into chunks of equal size, according to the number of bits, and transform the data according to this scale factor.  
Read more on scale factor calculation [in our docs](https://nervanasystems.github.io/distiller/algo_quantization.html).

The class `QuantCalibrationStatsCollector` collects the statistics for defining the range $r = max - min$.  

On each forward pass, the collector records the following statistics of the inputs and outputs of each layer:
- the absolute min and max over all batches (stored in `min`, `max`)
- the per-batch min and max, averaged over all batches (stored in `avg_min`, `avg_max`)
- mean
- std
- shape of output tensor  

All these values can be used to define the range of quantization, e.g. we can use the absolute `min`, `max` to define the range.

In [None]:
# Rebuild `man_model` from the saved session, independently of the cells above.
assert torch.cuda.is_available()
device = 'cuda:0'
sess_path = "save/LSTM_model.sess"
state = torch.load(sess_path)
man_model = PerformanceRNN(**state['model_config']).to(device)
# Load the checkpoint into the model that was just constructed. Loading it into
# `rnn_model` instead would leave `man_model` with its freshly initialised
# weights and make the "same weights" check below fail.
man_model.load_state_dict(state['model_state'])
# A freshly built PerformanceRNN has not been converted yet, and new modules
# start in training mode. Convert the LSTM to Distiller's implementation and
# switch to eval mode so this cell leaves `man_model` in the same state as the
# conversion/eval cells earlier in the notebook.
man_model = convert_model_to_distiller_lstm(man_model).eval()

Check that `man_model` has the same weights as `rnn_model`.

In [7]:
# Display the output layer's weights of `man_model` for a visual comparison
# with the same tensor of `rnn_model` in the next cell.
man_model.output_fc.weight

Parameter containing:
tensor([[-0.3716,  0.3090,  0.4582,  ...,  0.2778, -0.0749, -0.0093],
        [ 0.0394,  0.3140,  0.2614,  ...,  0.4272, -0.4883, -0.0285],
        [-0.3460,  0.4597, -0.2128,  ...,  0.2678, -0.3428,  0.4470],
        ...,
        [ 0.0919, -0.1194, -0.2238,  ...,  0.5525,  0.1656, -0.0762],
        [-0.1759, -0.2675, -0.3740,  ...,  0.4150, -0.2710,  0.0629],
        [-0.2019,  0.1103, -0.0234,  ...,  0.4642,  0.1504,  0.1126]],
       device='cuda:0', requires_grad=True)

In [8]:
# Display the output layer's weights of `rnn_model`; they should match the
# values shown for `man_model` in the previous cell.
rnn_model.output_fc.weight

Parameter containing:
tensor([[-0.3716,  0.3090,  0.4582,  ...,  0.2778, -0.0749, -0.0093],
        [ 0.0394,  0.3140,  0.2614,  ...,  0.4272, -0.4883, -0.0285],
        [-0.3460,  0.4597, -0.2128,  ...,  0.2678, -0.3428,  0.4470],
        ...,
        [ 0.0919, -0.1194, -0.2238,  ...,  0.5525,  0.1656, -0.0762],
        [-0.1759, -0.2675, -0.3740,  ...,  0.4150, -0.2710,  0.0629],
        [-0.2019,  0.1103, -0.0234,  ...,  0.4642,  0.1504,  0.1126]],
       device='cuda:0', requires_grad=True)

Check that `man_model` is on the GPU.

In [9]:
# Look at the first parameter yielded by the model and report whether it is a
# CUDA tensor. NOTE(review): this inspects one parameter only, not all of them.
next(man_model.parameters()).is_cuda

True

In [10]:
import os
from distiller.data_loggers import QuantCalibrationStatsCollector, collector_context

# File the calibration statistics are cached in. Collection is skipped when the
# file already exists, so delete it to force the statistics to be recomputed.
stats_file = 'performance_rnn_pretrained_stats.yaml'

# Distiller helper applied to the model before the collector is attached
# (presumably tags every layer with its fully-qualified name — TODO confirm).
distiller.utils.assign_layer_fq_names(man_model)
collector = QuantCalibrationStatsCollector(man_model)

# Size of the calibration run. These are arbitrary values: `batch_size` random
# init vectors are generated and `max_len` is passed to `generate`.
batch_size = 64
max_len = 100

if not os.path.isfile(stats_file):
    with collector_context(collector) as collector:
        # Calibration only needs forward passes, so disable autograd to avoid
        # building a graph over the whole generated sequence.
        with torch.no_grad():
            init = torch.randn(batch_size, man_model.init_dim).to(device)
            output = man_model.generate(init, max_len)
        collector.save(stats_file)

## Quantize Model:
  
We quantize the model after the training has completed.  
Here we check the baseline model perplexity, to have an idea how good the quantization is.

Now we do our magic - __Quantizing the model__.  
The quantizer replaces the layers in our model with their quantized versions.  
We can see that our model has changed:

In [11]:
from distiller.quantization import PostTrainLinearQuantizer, LinearQuantMode
from copy import deepcopy

# The quantizer is handed a deep copy of `man_model` rather than the model
# itself, together with the activation statistics file saved earlier.
model_to_quantize = deepcopy(man_model)
quantizer = PostTrainLinearQuantizer(
    model_to_quantize,
    model_activation_stats='performance_rnn_pretrained_stats.yaml',
)

# Replace the layers of the copied model with their quantized wrappers; the
# result is available as `quantizer.model`.
quantizer.prepare_model()

In [13]:
# Display the model held by the quantizer; the repr below shows the layers
# wrapped in Distiller's range-linear quantization wrappers.
quantizer.model

PerformanceRNN(
  (inithid_fc): RangeLinearQuantParamLayerWrapper(
    mode=SYMMETRIC, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=False, scale_approx_mult_bits=None
    preset_activation_stats=True
    w_scale=834.6518, w_zero_point=0.0000
    in_scale=38.1417, in_zero_point=0.0000
    out_scale=92.7936, out_zero_point=0.0000
    (wrapped_module): Linear(in_features=32, out_features=1536, bias=True)
  )
  (inithid_fc_activation): Tanh()
  (event_embedding): RangeLinearEmbeddingWrapper(
    (wrapped_module): Embedding(240, 240)
  )
  (concat_input_fc): RangeLinearQuantParamLayerWrapper(
    mode=SYMMETRIC, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=False, scale_approx_mult_bits=None
    preset_activation_stats=True
    w_scale=23.0247, w_zero_point=0.0000
    in_scale=127.0000, in_zero_point=0.0000
    out_scale=47.6283, out_zero_point=0.0000
    (wrapped_module): Linear(in_features=265, out_features