<a href="https://colab.research.google.com/github/murdockbleak/NN-Compression/blob/main/NNSC_Mobile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2

Collecting torch==1.7.1
  Downloading torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8 MB)
[K     |████████████████████████████████| 776.8 MB 16 kB/s 
[?25hCollecting torchvision==0.8.2
  Downloading torchvision-0.8.2-cp37-cp37m-manylinux1_x86_64.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 56.7 MB/s 
[?25hCollecting torchaudio==0.7.2
  Downloading torchaudio-0.7.2-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 23.6 MB/s 
Installing collected packages: torch, torchvision, torchaudio
  Attempting uninstall: torch
    Found existing installation: torch 1.10.0+cu111
    Uninstalling torch-1.10.0+cu111:
      Successfully uninstalled torch-1.10.0+cu111
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.11.1+cu111
    Uninstalling torchvision-0.11.1+cu111:
      Successfully uninstalled torchvision-0.11.1+cu111
  Attempting uninstall: torchaudio
    Found existing installation: torchaudio 

# Save ResNet models in TorchScript format

## Save ResNet model in TorchScript format

In [None]:
%%writefile model_generation_bs10.py

import torch
import torchvision
from torch.utils.bundled_inputs import (
  augment_model_with_bundled_inputs)
from torch.utils.mobile_optimizer import optimize_for_mobile

# Load PyTorch model
model = torchvision.models.resnet18(pretrained=True)
model.eval()

# Generate input image
example = torch.zeros(10, 3, 224, 224)

# Save model graph to TorchScript format
script_module = torch.jit.trace(model, example)

# Optimize for mobile PyTorch operations that are supported by Android framework
# If operations are not supported, they remain unchanged
script_module_optimized = optimize_for_mobile(script_module)

# Create a joint input consisting of model and input image
augment_model_with_bundled_inputs(script_module_optimized, [(example,)])

# Save binary file with model on the computer
torch.jit.save(script_module_optimized, "./resnet18_bs10.pt")

Writing model_generation_bs10.py


In [None]:
!ls

model_generation_bs10.py  model_generation_quantized.py  resnet18_quantized.pt
model_generation.py	  resnet18.pt			 sample_data


In [None]:
!python model_generation_bs10.py

## Save quantized ResNet model in TorchScript format


In [None]:
%%writefile model_generation_quantized_bs10.py

import torch
import torchvision
from torch.utils.bundled_inputs import (
  augment_model_with_bundled_inputs)
from torch.utils.mobile_optimizer import optimize_for_mobile

# Load PyTorch model
model = torchvision.models.quantization.resnet18(pretrained=True, quantize=True)
model.eval()

# Generate input image
example = torch.zeros(10, 3, 224, 224)

# Save model graph to TorchScript format
script_module = torch.jit.trace(model, example)


# Optimize for mobile PyTorch operations that are supported by Android framework
# If operations are not supported, they remain unchanged
script_module_optimized = optimize_for_mobile(script_module)

# Create a joint input consisting of model and input image
augment_model_with_bundled_inputs(script_module_optimized, [(example,)])

# Save binary file with model on the computer
torch.jit.save(script_module_optimized, "./resnet18_quantized_bs10.pt")

Writing model_generation_quantized_bs10.py


In [None]:
!python model_generation_quantized_bs10.py

  reduce_range will be deprecated in a future release of PyTorch."


# Compare memory allocated by TorchScript models

In [None]:
 import os
 # Report the on-disk size of the float and the quantized TorchScript files, in that order.
 for saved_model_path in ("resnet18_bs10.pt", "resnet18_quantized_bs10.pt"):
     size_in_bytes = os.path.getsize(saved_model_path)
     print('Size (MB):', size_in_bytes / 1024**2)


Size (MB): 44.59046459197998
Size (MB): 11.284072875976562


# Compare inference time of TorchScript models

In [None]:
import torch

# Load the TorchScript files written by the model-generation scripts above
# (resnet18_bs10.pt / resnet18_quantized_bs10.pt), so this cell works on a
# fresh runtime instead of depending on files left over from an earlier session.
# NOTE(review): these modules were traced with a batch of 10 and are timed
# below with a batch of 1 - confirm that is the comparison you want.
f_model = torch.jit.load('./resnet18_bs10.pt')
q_model = torch.jit.load('./resnet18_quantized_bs10.pt')


In [None]:
x = torch.randn((1, 3, 224, 224))

In [None]:
%timeit _ = f_model(x)

  result = self.forward(*input, **kwargs)


10 loops, best of 5: 75.7 ms per loop


In [None]:
%timeit _ = q_model(x)


10 loops, best of 5: 32.4 ms per loop


TorchScript records its definitions in an Intermediate Representation (or IR), commonly referred to in deep learning as a graph.

We can examine the graph with the .graph property:

In [None]:
f_model.graph

graph(%self.1 : __torch__.torchvision.models.resnet.___torch_mangle_332.ResNet,
      %input.1 : Tensor):
  %15 : bool = prim::Constant[value=0]() # /usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:586:0
  %8 : int = prim::Constant[value=3]() # :0:0
  %10 : int = prim::Constant[value=2]() # :0:0
  %12 : int = prim::Constant[value=1]() # :0:0
  %139 : int = prim::Constant[value=-1]() # /usr/local/lib/python3.7/dist-packages/torchvision/models/resnet.py:214:0
  %3 : __torch__.torch.classes.xnnpack.Conv2dOpContext = prim::GetAttr[name="prepack_folding._jit_pass_packed_weight_0"](%self.1)
  %6 : Tensor = prepacked::conv2d_clamp_run(%input.1, %3)
  %9 : int[] = prim::ListConstruct(%8, %8)
  %11 : int[] = prim::ListConstruct(%10, %10)
  %13 : int[] = prim::ListConstruct(%12, %12)
  %14 : int[] = prim::ListConstruct(%12, %12)
  %input0.1 : Tensor = aten::max_pool2d(%6, %9, %11, %13, %14, %15) # /usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:586:0
  %18 : __torch__.

In [None]:
q_model.graph

However, this is a very low-level representation and most of the information contained in the graph is not useful for end users. 

Instead, we can use the .code property to give a Python-syntax interpretation of the code:

In [None]:
# print() renders the newlines of the generated source; the bare expression
# would display the escaped string repr on a single line.
print(f_model.code)

'def forward(self,\n    input: Tensor) -> Tensor:\n  _0 = getattr(self, "prepack_folding._jit_pass_packed_weight_0")\n  _1 = ops.prepacked.conv2d_clamp_run(input, _0)\n  input0 = torch.max_pool2d(_1, [3, 3], [2, 2], [1, 1], [1, 1], False)\n  _2 = getattr(self, "prepack_folding._jit_pass_packed_weight_1")\n  _3 = ops.prepacked.conv2d_clamp_run(input0, _2)\n  _4 = getattr(self, "prepack_folding._jit_pass_packed_weight_2")\n  _5 = ops.prepacked.conv2d_clamp_run(_3, _4)\n  _6 = torch._add_relu_(_5, input0, alpha=1)\n  _7 = getattr(self, "prepack_folding._jit_pass_packed_weight_3")\n  _8 = ops.prepacked.conv2d_clamp_run(_6, _7)\n  _9 = getattr(self, "prepack_folding._jit_pass_packed_weight_4")\n  _10 = ops.prepacked.conv2d_clamp_run(_8, _9)\n  _11 = torch._add_relu_(_10, _6, alpha=1)\n  _12 = getattr(self, "prepack_folding._jit_pass_packed_weight_5")\n  _13 = ops.prepacked.conv2d_clamp_run(_11, _12)\n  _14 = getattr(self, "prepack_folding._jit_pass_packed_weight_6")\n  _15 = ops.prepacked.c

In [None]:
# print() renders the newlines of the generated source; the bare expression
# would display the escaped string repr on a single line.
print(q_model.code)

'def forward(self,\n    X: Tensor) -> Tensor:\n  _0 = self.fc\n  input = torch.quantize_per_tensor(X, 0.037445519119501114, 57, 13)\n  _1 = getattr(self, "_jit_pass_hoist_conv_packed_params.conv1._packed_params.1")\n  input0 = ops.quantized.conv2d_relu(input, _1, 0.028605546802282333, 0)\n  input1 = torch.max_pool2d(input0, [3, 3], [2, 2], [1, 1], [1, 1], False)\n  _2 = getattr(self, "_jit_pass_hoist_conv_packed_params.layer1.0.conv1._packed_params.2")\n  input2 = ops.quantized.conv2d_relu(input1, _2, 0.016524722799658775, 0)\n  _3 = getattr(self, "_jit_pass_hoist_conv_packed_params.layer1.0.conv2._packed_params.3")\n  x = ops.quantized.conv2d(input2, _3, 0.046455312520265579, 75)\n  input3 = ops.quantized.add_relu(x, input1, 0.034476079046726227, 0)\n  _4 = getattr(self, "_jit_pass_hoist_conv_packed_params.layer1.1.conv1._packed_params.4")\n  input4 = ops.quantized.conv2d_relu(input3, _4, 0.017180869355797768, 0)\n  _5 = getattr(self, "_jit_pass_hoist_conv_packed_params.layer1.1.conv2

### Why TorchScript?

- TorchScript code can be invoked in its own interpreter, which is basically a restricted Python interpreter. 

- This interpreter does not acquire the Global Interpreter Lock, and so many requests can be processed on the same instance simultaneously.

- This format allows us to save the whole model to disk and load it into another environment, such as in a server written in a language other than Python.

- TorchScript gives us a representation in which we can do compiler optimizations on the code to provide more efficient execution

- TorchScript allows us to interface with many backend/device runtimes that require a broader view of the program than individual operators.

- We can see that invoking the traced model produces the same results as the Python module.

Introduction to TorchScript: https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html


# Compare inference time and allocated memory of PyTorch models

In [None]:
import torchvision

# Float ResNet-18 in inference mode with gradient tracking disabled on every parameter.
float_model = torchvision.models.resnet18()
float_model.eval()
float_model.requires_grad_(False)

# Quantized ResNet-18 from torchvision's quantization models.
quantized_model = torchvision.models.quantization.resnet18(quantize=True)


  reduce_range will be deprecated in a future release of PyTorch."


In [None]:
float_model.layer1[0]

BasicBlock(
  (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [None]:
quantized_model.layer1[0]

QuantizableBasicBlock(
  (conv1): QuantizedConvReLU2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.005704640876501799, zero_point=0, padding=(1, 1))
  (bn1): Identity()
  (relu): Identity()
  (conv2): QuantizedConv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), scale=0.011008837260305882, zero_point=70, padding=(1, 1))
  (bn2): Identity()
  (add_relu): QFunctional(
    scale=0.007786150556057692, zero_point=0
    (activation_post_process): Identity()
  )
)

In [None]:
x = torch.randn(10, 3, 224, 224)

In [None]:
%timeit float_model(x) 

1 loop, best of 5: 781 ms per loop


In [None]:
%timeit quantized_model(x) 

1 loop, best of 5: 561 ms per loop


In [None]:
def print_size_of_model(model):
    """Print the serialized size of ``model``'s state dict in megabytes.

    The state dict is written with ``torch.save`` into a throwaway temporary
    directory, measured with ``os.path.getsize`` and then discarded. An
    existing ``temp.p`` in the working directory is therefore never
    overwritten or deleted, and nothing is left behind if saving fails.

    Args:
        model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. The size is printed as ``Size (MB): <value>``.
    """
    import tempfile  # local import keeps this cell self-contained

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Same file name as before; only the directory changes.
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        print('Size (MB):', os.path.getsize(checkpoint_path) / 1024**2)

In [None]:
print_size_of_model(float_model)
print_size_of_model(quantized_model)


Size (MB): 44.667840003967285
Size (MB): 11.293700218200684


# Compress with MusCO

In [None]:
!pip install tensorly==0.4.5
!pip install git+https://github.com/musco-ai/musco-pytorch.git@develop

Collecting tensorly==0.4.5
  Downloading tensorly-0.4.5.tar.gz (70 kB)
[K     |████████████████████████████████| 70 kB 5.3 MB/s 
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[K     |████████████████████████████████| 154 kB 17.2 MB/s 
[?25hBuilding wheels for collected packages: tensorly
  Building wheel for tensorly (setup.py) ... [?25l[?25hdone
  Created wheel for tensorly: filename=tensorly-0.4.5-py3-none-any.whl size=100163 sha256=f661fab41223a47c22d07b3cb844e5c6a2472a4a7ea0f833d43968468ea20dbd
  Stored in directory: /root/.cache/pip/wheels/05/ed/36/493bba3faa150a1193eec864db4951355eb64659330cb00722
Successfully built tensorly
Installing collected packages: nose, tensorly
Successfully installed nose-1.3.7 tensorly-0.4.5
Collecting git+https://github.com/musco-ai/musco-pytorch.git@develop
  Cloning https://github.com/musco-ai/musco-pytorch.git (to revision develop) to /tmp/pip-req-build-isshlnhj
  Running command git clone -q https://github.com/musco-ai/mus

In [None]:
from torchvision.models import resnet18
from flopco import FlopCo
from musco.pytorch import Compressor
from musco.pytorch.compressor.utils import standardize_model
import copy

In [None]:
?  Compressor

In [None]:
import torch

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on a CPU-only runtime.
# NOTE(review): the CPU path of FlopCo/Compressor has not been exercised here - confirm.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the model
model = resnet18(pretrained=True).to(device)

# Collect initial model statistics
model_stats = FlopCo(model,
                     img_size = (1, 3, 128, 128),
                     device = device)

# Optional per-layer compression schedule (kept as a commented example; pass it
# to Compressor via `model_compr_kwargs` to override the `config_type` default).
# model_compr_kwargs = {
#     'layer3.1.conv2': {'decomposition': 'tucker2',
#                        'rank_selection': 'manual',
#                        'manual_rank': [(32, 32), (16, 16)],
#                        'curr_compr_iter': 0
#                       },
#     'layer2.1.conv2': {'decomposition': 'tucker2',
#                        'rank_selection': 'vbmf',
#                        'vbmf_weakenen_factor': 0.9,
#                        'curr_compr_iter': 0
#                       },
#     'fc': {'decomposition': 'svd',
#                       'rank_selection': 'param_reduction',
#                       'param_reduction_rate': 4,
#                       'curr_compr_iter': 0
#                       },
# }


# Initialize a compressor on a copy, so `model` stays available as the
# uncompressed baseline.
compressor = Compressor(copy.deepcopy(model),
                        model_stats,
                        ft_every=100,
                        nglobal_compress_iters=1,
                        # model_compr_kwargs = model_compr_kwargs,
                        config_type = 'vbmf'
                        )


# Alternate compression and fine-tuning steps, while compression is not done
# (i.e., until each compressing layer is compressed `nglobal_compress_iters` times)
while not compressor.done:
    # Compress layers
    compressor.compression_step()

    # Fine-tuning of compressor.compressed_model would go here; this notebook
    # does not fine-tune between compression steps.

# Replace custom layers with standard nn.Module layers.
standardize_model(compressor.compressed_model)

# compressor.compressed_model is our final compressed and standardized model.
compressor.compressed_model

conv1 defaultdict(None, {'decomposition': 'tucker2', 'rank_selection': 'vbmf', 'manual_rank': None, 'param_reduction_rate': None, 'vbmf_weakenen_factor': 0.8, 'curr_compr_iter': 0})
layer1.0.conv1 defaultdict(None, {'decomposition': 'tucker2', 'rank_selection': 'vbmf', 'manual_rank': None, 'param_reduction_rate': None, 'vbmf_weakenen_factor': 0.8, 'curr_compr_iter': 0})
layer1.0.conv2 defaultdict(None, {'decomposition': 'tucker2', 'rank_selection': 'vbmf', 'manual_rank': None, 'param_reduction_rate': None, 'vbmf_weakenen_factor': 0.8, 'curr_compr_iter': 0})
layer1.1.conv1 defaultdict(None, {'decomposition': 'tucker2', 'rank_selection': 'vbmf', 'manual_rank': None, 'param_reduction_rate': None, 'vbmf_weakenen_factor': 0.8, 'curr_compr_iter': 0})
layer1.1.conv2 defaultdict(None, {'decomposition': 'tucker2', 'rank_selection': 'vbmf', 'manual_rank': None, 'param_reduction_rate': None, 'vbmf_weakenen_factor': 0.8, 'curr_compr_iter': 0})
layer2.0.conv1 defaultdict(None, {'decomposition': 'tu

ResNet(
  (conv1): Sequential(
    (conv1-0): Conv2d(3, 3, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (conv1-1): Conv2d(3, 36, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (conv1-2): Conv2d(36, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (conv1-0): Conv2d(64, 44, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (conv1-1): Conv2d(44, 36, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (conv1-2): Conv2d(36, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      )
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Sequential(
        (conv2-0): Conv

In [None]:
# Benchmark an independent copy of the compressed network: inference mode,
# gradient tracking disabled on every parameter.
cmodel = copy.deepcopy(compressor.compressed_model).eval()
cmodel.requires_grad_(False)
print()




In [None]:
import torch

device = 'cpu'
# Tensor.to() returns a new tensor instead of moving the tensor in place, so
# its result has to be kept; the previous bare `x.to(device)` was a no-op.
x = torch.randn((1, 3, 224, 224)).to(device)
# Module.to() moves the module's parameters in place.
cmodel.to(device)
# Final print() keeps the cell from displaying the module repr.
print()




In [None]:
%timeit _ = cmodel(x)

10 loops, best of 5: 70.8 ms per loop


In [None]:
# Put the uncompressed baseline in inference mode, disable gradient tracking
# on its parameters and move it to the benchmark device.
model.eval().requires_grad_(False).to(device)

print()




In [None]:
%timeit _ = model(x)


10 loops, best of 5: 88.7 ms per loop


In [None]:
# For every residual block, pair the last conv of each decomposed convolution
# (`convN.convN-2`) with the BatchNorm that follows it (`bnN`).
block_ids = ['1.0', '1.1', '2.0', '2.1', '3.0', '3.1', '4.0', '4.1']
layers_to_fuse = [
    [f'layer{block_id}.conv{conv_idx}.conv{conv_idx}-2',
     f'layer{block_id}.bn{conv_idx}']
    for block_id in block_ids
    for conv_idx in (1, 2)
]

# The compressed model is still in training mode at this point, in which case
# fuse_modules only wraps each pair in a ConvBn2d container. Switching to eval
# mode first lets it fold the BatchNorm statistics into the conv weights.
compressor.compressed_model.eval()

torch.quantization.fuse_modules(compressor.compressed_model,
                                layers_to_fuse,
                                inplace = True)

ResNet(
  (conv1): Sequential(
    (conv1-0): Conv2d(3, 3, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (conv1-1): Conv2d(3, 36, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (conv1-2): Conv2d(36, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
  )
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (conv1-0): Conv2d(64, 44, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (conv1-1): Conv2d(44, 36, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (conv1-2): ConvBn2d(
          (0): Conv2d(36, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (bn1): Identity()
      (relu): ReLU(inp