# Quantize GRU

In [1]:
from model import PerformanceRNN
import torch
from torch import nn
import distiller
from distiller.modules.gru import DistillerGRU as GRU
from distiller.modules.gru import convert_model_to_distiller_gru
from tqdm import tqdm
import numpy as np

### Loading the model and converting to our own implementation.

In [2]:
assert torch.cuda.is_available()
device = 'cuda:0'
sess_path = "save/ecomp_w500.sess"
state = torch.load(sess_path)
rnn_model = PerformanceRNN(**state['model_config']).to(device)
rnn_model.load_state_dict(state['model_state'])

In [3]:
convert_model_to_distiller_gru(rnn_model)

PerformanceRNN(
  (inithid_fc): Linear(in_features=32, out_features=1536, bias=True)
  (inithid_fc_activation): Tanh()
  (event_embedding): Embedding(240, 240)
  (concat_input_fc): Linear(in_features=265, out_features=512, bias=True)
  (concat_input_fc_activation): LeakyReLU(negative_slope=0.1, inplace)
  (gru): DistillerGRU(512, 512, num_layers=3, dropout=0.30, bidirectional=False)
  (output_fc): Linear(in_features=1536, out_features=240, bias=True)
  (output_fc_activation): Softmax()
)

In [None]:
man_model.gru

Check that man_model is on GRU.

In [None]:
next(man_model.parameters()).is_cuda

### Check that the conversion has succeeded:

In [None]:
rnn_model.eval()

In [None]:
man_model.eval()

Test to make sure that both the original and manual models can generate output.

In [12]:
model = quantizer.model.to(device)
model.eval()
batch_size = 1
init = torch.randn(batch_size, model.init_dim).to(device)
max_len = 1000
controls=None
greedy_ratio = 0.7
temperature = 1.0

import pdb

with torch.no_grad():
    #pdb.set_trace()
    outputs = model.generate(init, max_len,
                             controls=controls,
                             greedy=greedy_ratio,
                             temperature=temperature,
                             verbose=True)
    

outputs = outputs.cpu().numpy().T # [batch, steps]

<class 'torch.Tensor'>


Convert output to MIDI and save.

In [13]:
import utils
import os

output_dir = "quantized_output/"
os.makedirs(output_dir, exist_ok=True)

for i, output in enumerate(outputs):
    name = f'output-{i:03d}.mid'
    path = os.path.join(output_dir, name)
    n_notes = utils.event_indeces_to_midi_file(output, path)
    print(f'===> {path} ({n_notes} notes)')

===> quantized_output/output-000.mid (122 notes)


### Defining the evaluation:

# Quantizing the model:

## Collect activation statistics:

The model uses activation statistics to determine how big the quantization range is. The bigger the range - the larger the round off error after quantization which leads to accuracy drop.  
Our goal is to minimize the range s.t. it contains the absolute most of our data.  
After that, we divide the range into chunks of equal size, according to the number of bits, and transform the data according to this scale factor.  
Read more on scale factor calculation [in our docs](https://nervanasystems.github.io/distiller/algo_quantization.html).

The class `QuantCalibrationStatsCollector` collects the statistics for defining the range $r = max - min$.  

Each forward pass, the collector records the values of inputs and outputs, for each layer:
- absolute over all batches min, max (stored in `min`, `max`)
- average over batches, per batch min, max (stored in `avg_min`, `avg_max`)
- mean
- std
- shape of output tensor  

All these values can be used to define the range of quantization, e.g. we can use the absolute `min`, `max` to define the range.

Check that `man_model` has the same weights as `rnn_model` (Warning: Running this will move the models to the CPU).

In [None]:
import numpy.testing as nptest

man_model_weights = man_model.cpu().output_fc.weight.detach().numpy()
rnn_model_weights = rnn_model.cpu().output_fc.weight.detach().numpy()
nptest.assert_array_almost_equal(man_model_weights, rnn_model_weights)

Check that `man_model` is on the GPU.

In [None]:
next(man_model.parameters()).is_cuda

In [6]:
# My version.
import os
from distiller.data_loggers import QuantCalibrationStatsCollector, collector_context

# Commented line is probably not necessary.
#man_model = torch.load('./manual.checkpoint.pth.tar')
distiller.utils.assign_layer_fq_names(rnn_model)
collector = QuantCalibrationStatsCollector(rnn_model)#,
                                           #inplace_runtime_check=True,
                                           #disable_inplace_attrs=True) # I added this last argument.

# Random numbers.
batch_size = 64
max_len = 100

if not os.path.isfile('performance_rnn_pretrained_stats.yaml'):
    with collector_context(collector) as collector:
        init = torch.randn(batch_size, rnn_model.init_dim).to(device)
        output = rnn_model.generate(init, max_len)
        collector.save('performance_rnn_pretrained_stats.yaml')

<class 'torch.Tensor'>


## Quantize Model:
  
We quantize the model after the training has completed.  
Here we check the baseline model perplexity, to have an idea how good the quantization is.

Now we do our magic - __Quantizing the model__.  
The quantizer replaces the layers in out model with their quantized versions.  
We can see that our model has changed:

In [7]:
from distiller.quantization import PostTrainLinearQuantizer, LinearQuantMode
from copy import deepcopy
# Define the quantizer
quantizer = PostTrainLinearQuantizer(
    deepcopy(rnn_model),
    model_activation_stats='performance_rnn_pretrained_stats.yaml')

# Quantizer magic:

quantizer.prepare_model()

In [8]:
quantizer.model

PerformanceRNN(
  (inithid_fc): RangeLinearQuantParamLayerWrapper(
    mode=SYMMETRIC, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=False, scale_approx_mult_bits=None
    preset_activation_stats=True
    w_scale=520.7937, w_zero_point=0.0000
    in_scale=35.3958, in_zero_point=0.0000
    out_scale=28.3010, out_zero_point=0.0000
    (wrapped_module): Linear(in_features=32, out_features=1536, bias=True)
  )
  (inithid_fc_activation): Tanh()
  (event_embedding): RangeLinearEmbeddingWrapper(
    (wrapped_module): Embedding(240, 240)
  )
  (concat_input_fc): RangeLinearQuantParamLayerWrapper(
    mode=SYMMETRIC, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=False, scale_approx_mult_bits=None
    preset_activation_stats=True
    w_scale=37.3165, w_zero_point=0.0000
    in_scale=127.0000, in_zero_point=0.0000
    out_scale=95.1984, out_zero_point=0.0000
    (wrapped_module): Linear(in_features=265, out_features

In [None]:
criterion = nn.CrossEntropyLoss()
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
    

def evaluate(model):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        # The line below was fixed as per: https://github.com/pytorch/examples/issues/214
        for i in tqdm(range(0, data_source.size(0), sequence_len)):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

In [None]:
evaluate(quantizer.model.to(device), val_data)

## Evaluate the perplexity of the original and quantized models.

In [9]:
from data import Dataset
from sequence import EventSeq

data_path = "dataset/processed/ecomp_piano"
dataset = Dataset(data_path, verbose=True)
dataset_size = len(dataset.samples)
assert dataset_size > 0

# Eventually need to put these in YAML file.
controls = None
teacher_forcing_ratio = 1.0
loss_function = nn.CrossEntropyLoss()
window_size = 200
stride_size = 10
use_transposition = False
control_ratio = 1.0
event_dim = EventSeq.dim()

batch_gen = dataset.batches(batch_size, window_size, stride_size)

In [10]:
for iteration, (events, controls) in enumerate(batch_gen):
    if use_transposition:
        offset = np.random.choice(np.arange(-6, 6))
        events, controls = utils.transposition(events, controls, offset)

    events = torch.LongTensor(events).to(device)
    assert events.shape[0] == window_size

    if np.random.random() < control_ratio:
        controls = torch.FloatTensor(controls).to(device)
        assert controls.shape[0] == window_size
    else:
        controls = None

    init = torch.randn(batch_size, model.init_dim).to(device)
    outputs = model.generate(init, window_size, events=events[:-1], controls=controls,
                             teacher_forcing_ratio=teacher_forcing_ratio, output_type='logit')
    assert outputs.shape[:2] == events.shape[:2]

    loss = loss_function(outputs.view(-1, event_dim), events.view(-1))
    print(loss)
    #model.zero_grad()
    #loss.backward()

#    norm = utils.compute_gradient_norm(model.parameters())
#    nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    
    #optimizer.step()

#    if enable_logging:
#        writer.add_scalar('model/loss', loss.item(), iteration)
#        writer.add_scalar('model/norm', norm.item(), iteration)

    print(f'iter {iteration}, loss: {loss.item()}')

#    if time.time() - last_saving_time > saving_interval:
#        save_model()
#        last_saving_time = time.time()

<class 'torch.Tensor'>
tensor(2.1714, device='cuda:0', grad_fn=<NllLossBackward>)
iter 0, loss: 2.171393394470215
<class 'torch.Tensor'>
tensor(2.1823, device='cuda:0', grad_fn=<NllLossBackward>)
iter 1, loss: 2.1822714805603027
<class 'torch.Tensor'>
tensor(2.1459, device='cuda:0', grad_fn=<NllLossBackward>)
iter 2, loss: 2.1458685398101807
<class 'torch.Tensor'>
tensor(2.2377, device='cuda:0', grad_fn=<NllLossBackward>)
iter 3, loss: 2.2376677989959717
<class 'torch.Tensor'>
tensor(2.1788, device='cuda:0', grad_fn=<NllLossBackward>)
iter 4, loss: 2.1788384914398193
<class 'torch.Tensor'>
tensor(2.2040, device='cuda:0', grad_fn=<NllLossBackward>)
iter 5, loss: 2.203986406326294
<class 'torch.Tensor'>
tensor(2.1233, device='cuda:0', grad_fn=<NllLossBackward>)
iter 6, loss: 2.1233153343200684
<class 'torch.Tensor'>
tensor(2.1776, device='cuda:0', grad_fn=<NllLossBackward>)
iter 7, loss: 2.177565574645996
<class 'torch.Tensor'>
tensor(2.1893, device='cuda:0', grad_fn=<NllLossBackward>)
i

<class 'torch.Tensor'>
tensor(2.1756, device='cuda:0', grad_fn=<NllLossBackward>)
iter 71, loss: 2.1756279468536377
<class 'torch.Tensor'>
tensor(2.1944, device='cuda:0', grad_fn=<NllLossBackward>)
iter 72, loss: 2.1943514347076416
<class 'torch.Tensor'>
tensor(2.2062, device='cuda:0', grad_fn=<NllLossBackward>)
iter 73, loss: 2.206209659576416
<class 'torch.Tensor'>
tensor(2.2105, device='cuda:0', grad_fn=<NllLossBackward>)
iter 74, loss: 2.210500717163086
<class 'torch.Tensor'>
tensor(2.1799, device='cuda:0', grad_fn=<NllLossBackward>)
iter 75, loss: 2.17990779876709
<class 'torch.Tensor'>
tensor(2.2082, device='cuda:0', grad_fn=<NllLossBackward>)
iter 76, loss: 2.2082276344299316
<class 'torch.Tensor'>
tensor(2.2239, device='cuda:0', grad_fn=<NllLossBackward>)
iter 77, loss: 2.2238588333129883
<class 'torch.Tensor'>
tensor(2.1077, device='cuda:0', grad_fn=<NllLossBackward>)
iter 78, loss: 2.1077404022216797
<class 'torch.Tensor'>
tensor(2.1045, device='cuda:0', grad_fn=<NllLossBackw

<class 'torch.Tensor'>
tensor(2.1790, device='cuda:0', grad_fn=<NllLossBackward>)
iter 142, loss: 2.179039716720581
<class 'torch.Tensor'>
tensor(2.1725, device='cuda:0', grad_fn=<NllLossBackward>)
iter 143, loss: 2.1725101470947266
<class 'torch.Tensor'>
tensor(2.1795, device='cuda:0', grad_fn=<NllLossBackward>)
iter 144, loss: 2.179532527923584
<class 'torch.Tensor'>
tensor(2.2120, device='cuda:0', grad_fn=<NllLossBackward>)
iter 145, loss: 2.212005615234375
<class 'torch.Tensor'>
tensor(2.2407, device='cuda:0', grad_fn=<NllLossBackward>)
iter 146, loss: 2.2406809329986572
<class 'torch.Tensor'>
tensor(2.1948, device='cuda:0', grad_fn=<NllLossBackward>)
iter 147, loss: 2.1947851181030273
<class 'torch.Tensor'>
tensor(2.2315, device='cuda:0', grad_fn=<NllLossBackward>)
iter 148, loss: 2.231478214263916
<class 'torch.Tensor'>
tensor(2.1808, device='cuda:0', grad_fn=<NllLossBackward>)
iter 149, loss: 2.180799722671509
<class 'torch.Tensor'>
tensor(2.1546, device='cuda:0', grad_fn=<NllLo

KeyboardInterrupt: 

In [11]:
model

PerformanceRNN(
  (inithid_fc): Linear(in_features=32, out_features=1536, bias=True)
  (inithid_fc_activation): Tanh()
  (event_embedding): Embedding(240, 240)
  (concat_input_fc): Linear(in_features=265, out_features=512, bias=True)
  (concat_input_fc_activation): LeakyReLU(negative_slope=0.1, inplace)
  (gru): DistillerGRU(512, 512, num_layers=3, dropout=0.30, bidirectional=False)
  (output_fc): Linear(in_features=1536, out_features=240, bias=True)
  (output_fc_activation): Softmax()
)