## Setup and imports

In [None]:
![ -d "LPRNet_Pytorch" ] && rm -rf LPRNet_Pytorch
!git clone https://github.com/bdbux/LPRNet_Pytorch.git

import torch
from torch import nn
import copy
import math
import random
import time
from collections import OrderedDict, defaultdict
from typing import Union, List

Cloning into 'LPRNet_Pytorch'...
remote: Enumerating objects: 1172, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 1172 (delta 79), reused 116 (delta 73), pack-reused 1041 (from 1)[K
Receiving objects: 100% (1172/1172), 20.06 MiB | 14.05 MiB/s, done.
Resolving deltas: 100% (89/89), done.


In [None]:
!pip install tvm==1.0.0
!python3 -m  pip install mlc-ai-cpu -f https://mlc.ai/wheels
import tvm
from tvm.ir.module import IRModule
from tvm.script import tir as T, relax as R
from tvm import relax
import numpy as np

# This is needed for deferring annotation parsing in TVMScript
from __future__ import annotations

import torch
import torch.nn as nn
from torch import fx
from torch.nn import functional as F


Looking in links: https://mlc.ai/wheels


In [None]:
!pip install torchtune
!pip install torchao
import torchtune as tt
import torch.optim as optim



In [None]:
# Helper functions
def get_sparsity(tensor: torch.Tensor) -> float:
    """
    Return the fraction of entries in `tensor` that are exactly zero.
        sparsity = #zeros / #elements = 1 - #nonzeros / #elements
    """
    nonzero_count = float(tensor.count_nonzero())
    density = nonzero_count / tensor.numel()
    return 1 - density


def get_model_sparsity(model: nn.Module) -> float:
    """
    Return the fraction of zero-valued entries across all parameters of `model`.
        sparsity = #zeros / #elements = 1 - #nonzeros / #elements
    """
    weights = list(model.parameters())
    nonzero_total = sum(w.count_nonzero() for w in weights)
    element_total = sum(w.numel() for w in weights)
    density = float(nonzero_total) / element_total
    return 1 - density

def get_num_parameters(model: nn.Module, count_nonzero_only=False) -> int:
    """
    calculate the total number of parameters of model
    :param model: module whose parameters() are counted
    :param count_nonzero_only: only count nonzero weights
    :return: the count as a plain Python int in both modes
    """
    num_counted_elements = 0
    for param in model.parameters():
        if count_nonzero_only:
            # count_nonzero() yields a 0-dim tensor; convert it so the running
            # total stays an int as the return annotation promises.
            num_counted_elements += int(param.count_nonzero())
        else:
            num_counted_elements += param.numel()
    return num_counted_elements


def get_model_size(model: nn.Module, data_width=32, count_nonzero_only=False) -> int:
    """
    Return the storage needed for the model's parameters, in bits.
    :param data_width: #bits per element
    :param count_nonzero_only: only count nonzero weights
    """
    element_count = get_num_parameters(model, count_nonzero_only)
    return element_count * data_width


## Setup and Original Model Testing

Note that model size is calculated the same way as in the pruning project. ONNX could not export the LPRNet model to an `.onnx` file without extensive debugging, because the provided guide was not very helpful.

In [None]:
import sys, os, torch

# Make the cloned repository importable; the guard keeps re-running this cell
# from appending the same entry again.
if 'LPRNet_Pytorch' not in sys.path:
    sys.path.append('LPRNet_Pytorch')
# Workaround kept from the original notebook; which later step needs the
# deeper recursion limit is not visible here.
sys.setrecursionlimit(10000)

import test_LPRNet
device = torch.device('cpu')
print(f"Device: {device}")

# Load arguments and customize if needed
args = test_LPRNet.get_parser()
args.pretrained_model = "LPRNet_Pytorch/weights/Final_LPRNet_model.pth"

# Instantiate the model
model = test_LPRNet.get_model(args)

# get_model_size() returns bits, so divide by 8 as well as 1024 * 1024 to
# report megabytes rather than megabits.
model_size = get_model_size(model, data_width=32)
print(f"Model size: {model_size / (8 * 1024 * 1024):.2f} MB")
model_sparsity = get_model_sparsity(model)
print(f"Model sparsity: {model_sparsity:.4f}")

print("Testing Original Model")
test_LPRNet.test(args, model)


Device: cpu
Loaded pretrained model successfully!
Model size: 13.64 MB
Model sparsity: 0.0000
Testing Original Model
Build successful with provided model!


  model.load_state_dict(torch.load(args.pretrained_model, map_location=device))


[Info] Test Accuracy: 0.897 [897:61:42:1000]
[Info] Individual Test Speed: 0.21846267294883728s 1/1000]
Total time: 218.46267294883728 seconds


## RelaxIR module (MLC1)

In [None]:
# Not currently working - please see error message saved below
# and report for more details
"""
state_dict = torch.load("LPRNet_Pytorch/weights/Final_LPRNet_model.pth", map_location=torch.device('cpu'))

def map_param(param: nn.Parameter):
    # Wrap a weight tensor as a Relax constant so it can be used as an operand
    # of emitted ops. Only param.data is read; callers in this cell pass both
    # module parameters and state_dict entries.
    # NOTE(review): the second argument to relax.const is a DynTensorType here;
    # confirm that the installed TVM build accepts it.
    ndim = len(param.data.shape)
    return relax.const(
        param.data.cpu().numpy(), relax.DynTensorType(ndim, "float32")
    )

def fetch_attr(fx_mod, target: str):
    # Resolve a dotted attribute path (e.g. "a.b.c") on fx_mod by walking one
    # attribute at a time, and return the object found at the end of the path.
    # Raises RuntimeError naming the path up to and including the first
    # attribute that does not exist.
    target_atoms = target.split('.')
    attr_itr = fx_mod
    for i, atom in enumerate(target_atoms):
        if not hasattr(attr_itr, atom):
            # Slice through i + 1 so the message contains the atom that failed,
            # not just the prefix that resolved successfully.
            raise RuntimeError(f"Node referenced nonexistant target {'.'.join(target_atoms[:i + 1])}")
        attr_itr = getattr(attr_itr, atom)
    return attr_itr

# ReLU
def map_nn_relu_op(bb, node_map, node, nn_mod):
    # Emit a Relax ReLU over the already-translated first argument of the node.
    # nn_mod is accepted for signature parity with the other handlers only.
    operand = node_map[node.args[0]]
    activated = relax.op.nn.relu(operand)
    return bb.emit(activated)

# Linear
def map_nn_linear_op(bb, node_map, node, nn_mod):
    # Lower a Linear call_module node to relax.op.linear.
    # The weight is looked up in state_dict under "<node.target>.weight"; the
    # bias is taken from the module itself.
    x = node_map[node.args[0]]
    w = map_param(state_dict[f"{node.target}.weight"])
    # The bias is optional: bind b to None for bias-free layers so the call
    # below never references an unassigned name.
    b = map_param(nn_mod.bias) if nn_mod.bias is not None else None
    return bb.emit(relax.op.linear(x, w, b))

# View
def map_nn_view_op(bb, node_map, node, nn_mod):
    # Translate a view call into a Relax reshape: the first node argument is
    # the tensor, every remaining argument is one entry of the target shape.
    # NOTE(review): shape entries are forwarded as-is; confirm none of them are
    # FX nodes (e.g. from x.size(0)) before relying on this.
    target_shape = tuple(node.args[1:])
    tensor = node_map[node.args[0]]
    return bb.emit(relax.op.reshape(tensor, target_shape))

# Conv2d
def map_nn_conv2d_op(bb, node_map, node, nn_mod):
    # Lower a Conv2d call_module node to relax.op.nn.conv2d, followed by an
    # explicit bias add when the module has a bias.
    data = node_map[node.args[0]]

    kernel = map_param(state_dict[f"{node.target}.weight"])
    if nn_mod.bias is not None:
        bias = map_param(state_dict[f"{node.target}.bias"])
    else:
        bias = None

    # Convolution hyper-parameters are read straight from the module.
    out = bb.emit(relax.op.nn.conv2d(
        data,
        kernel,
        strides=nn_mod.stride,
        padding=nn_mod.padding,
        dilation=nn_mod.dilation,
        groups=nn_mod.groups,
    ))

    if bias is None:
        return out

    # Reshape the bias to (1, -1, 1, 1) so it broadcasts over the conv output.
    bias_4d = bb.emit(relax.op.reshape(bias, (1, -1, 1, 1)))
    return bb.emit(relax.op.add(out, bias_4d))

#BatchNorm2d
def map_nn_batchnorm2d_op(bb, node_map, node, nn_mod):
    # Lower a BatchNorm2d call_module node to relax.op.nn.batch_norm over axis 1
    # and emit element 0 of the op's result.
    def _from_state(suffix):
        # Fetch "<node.target>.<suffix>" from state_dict as a Relax constant.
        return map_param(state_dict[f"{node.target}.{suffix}"])

    data = node_map[node.args[0]]

    # Scale (gamma) and shift (beta) are optional on the module; the running
    # statistics are always read.
    gamma = None if nn_mod.weight is None else _from_state("weight")
    beta = None if nn_mod.bias is None else _from_state("bias")
    running_mean = _from_state("running_mean")
    running_var = _from_state("running_var")

    normalized = relax.op.nn.batch_norm(
        data, gamma, beta, running_mean, running_var,
        axis=1, epsilon=nn_mod.eps, momentum=nn_mod.momentum
    )
    return bb.emit(normalized[0])

# Max Pool
def map_nn_maxpool3d_op(bb, node_map, node, nn_mod):
    # Lower a 3-D max-pool node to relax.op.nn.max_pool3d.
    #
    # Pool settings are read from nn_mod when it exposes them (call_module
    # nodes) and from node.kwargs otherwise, falling back to the defaults this
    # handler used before: kernel (1, 3, 3), stride = kernel, padding 0,
    # dilation 1.
    #
    # relax's max_pool3d rejects anything but 5-D input, whereas the model
    # feeds this layer a 4-D tensor. A 4-D input is therefore wrapped in a
    # leading batch axis for the op and unwrapped again afterwards.
    def _triple(value):
        # Broadcast a scalar setting to (depth, height, width).
        return tuple(value) if isinstance(value, (tuple, list)) else (value,) * 3

    def _setting(name, default):
        # Module attributes win; kwargs cover nodes that carry their own settings.
        if hasattr(nn_mod, name):
            return getattr(nn_mod, name)
        return node.kwargs.get(name, default)

    x = node_map[node.args[0]]

    kernel_size = _triple(_setting('kernel_size', (1, 3, 3)))
    stride = _setting('stride', None)
    stride = kernel_size if stride is None else _triple(stride)
    padding = _triple(_setting('padding', 0))
    dilation = _triple(_setting('dilation', 1))

    unbatched = x.struct_info.ndim == 4
    if unbatched:
        x = bb.emit(relax.op.expand_dims(x, axis=0))

    pooled = bb.emit(relax.op.nn.max_pool3d(
        x, pool_size=kernel_size, strides=stride, padding=padding, dilation=dilation
    ))

    if unbatched:
        pooled = bb.emit(relax.op.squeeze(pooled, axis=[0]))
    return pooled

# AvgPool2d
def map_nn_avgpool2d_op(bb, node_map, node, nn_mod):
    # Lower an AvgPool2d call_module node to relax.op.nn.avg_pool2d using the
    # module's own window, stride and padding.
    data = node_map[node.args[0]]

    window = nn_mod.kernel_size
    # A module without an explicit stride steps by its window size.
    step = window if nn_mod.stride is None else nn_mod.stride

    pooled = relax.op.nn.avg_pool2d(
        data, pool_size=window, strides=step, padding=nn_mod.padding
    )
    return bb.emit(pooled)

# Translate pytorch computational graph
def from_fx(fx_mod, input_shapes, call_function_map, call_module_map):
    # Translate a torch.fx GraphModule into a Relax IRModule with one "main"
    # function.
    #   fx_mod:            traced module whose graph nodes are visited in order
    #   input_shapes:      one shape per placeholder node, in placeholder order
    #   call_function_map: {python callable: handler(bb, node_map, node, nn_mod)}
    #   call_module_map:   {module type: handler(bb, node_map, node, nn_mod)}
    # Returns the module produced by the block builder.
    input_index = 0
    node_map = {}
    named_modules = dict(fx_mod.named_modules())

    bb = relax.BlockBuilder()

    fn_inputs = []
    fn_output = None

    with bb.function("main"):
        with bb.dataflow():
            for node in fx_mod.graph.nodes:
                if node.op == "placeholder":
                    # create input placeholder
                    shape = input_shapes[input_index]
                    input_index += 1
                    input_var = relax.Var(node.target, relax.TensorStructInfo(shape, "float32"))
                    fn_inputs.append(input_var)
                    node_map[node] = input_var
                elif node.op == "get_attr":
                    node_map[node] = map_param(fetch_attr(fx_mod, node.target))
                elif node.op == "call_function":
                    if node.target in call_function_map:
                        # Function nodes have no backing module. The handlers are
                        # shared with call_module_map and take four arguments, so
                        # pass None for nn_mod instead of omitting it.
                        node_map[node] = call_function_map[node.target](bb, node_map, node, None)
                elif node.op == "call_module":
                    named_module = named_modules[node.target]
                    node_map[node] = call_module_map[type(named_module)](bb, node_map, node, named_module)
                elif node.op == "call_method":
                    # Explicitly handle view
                    if node.target == "view":
                        node_map[node] = map_nn_view_op(bb, node_map, node, fx_mod)
                elif node.op == "output":
                    output = node_map[node.args[0]]
                    assert fn_output is None
                    fn_output = bb.emit_output(output)

        bb.emit_func_output(fn_output, fn_inputs)

        print(node_map)
        print(bb)

    return bb.get()

import torch.fx

# use fx graph translation
fx_model = fx.symbolic_trace(model)
# print(fx_model.print_readable)

# translate from FX representation to Relax IR module
RelaxModule = from_fx(
    fx_model,
    # NCHW with height 24 and width 94. With the two swapped (94, 24) the
    # network's later pooling stage computes a zero-width output, as the
    # quantization traceback further down in this notebook shows.
    input_shapes = [(1, 3, 24, 94)],
    call_function_map = {
        torch.nn.functional.relu: map_nn_relu_op,
        torch.nn.functional.max_pool3d: map_nn_maxpool3d_op,
        torch.Tensor.view: map_nn_view_op,

    },
    call_module_map={
        torch.nn.Linear: map_nn_linear_op,
        torch.nn.Conv2d: map_nn_conv2d_op,
        torch.nn.BatchNorm2d: map_nn_batchnorm2d_op,
        torch.nn.AvgPool2d: map_nn_avgpool2d_op,
        torch.nn.ReLU: map_nn_relu_op,
        torch.nn.MaxPool3d: map_nn_maxpool3d_op,
    },
)

RelaxModule.show()
ex = relax.vm_build.build(RelaxModule, target="llvm")
print(ex)
"""


'\nstate_dict = torch.load("LPRNet_Pytorch/weights/Final_LPRNet_model.pth", map_location=torch.device(\'cpu\'))\n\ndef map_param(param: nn.Parameter):\n    ndim = len(param.data.shape)\n    return relax.const(\n        param.data.cpu().numpy(), relax.DynTensorType(ndim, "float32")\n    )\n\ndef fetch_attr(fx_mod, target: str):\n    target_atoms = target.split(\'.\')\n    attr_itr = fx_mod\n    for i, atom in enumerate(target_atoms):\n        if not hasattr(attr_itr, atom):\n            raise RuntimeError(f"Node referenced nonexistant target {\'.\'.join(target_atoms[:i])}")\n        attr_itr = getattr(attr_itr, atom)\n    return attr_itr\n\n# ReLU\ndef map_nn_relu_op(bb, node_map, node, nn_mod):\n    A = node_map[node.args[0]]\n    return bb.emit(relax.op.nn.relu(A))\n\n# Linear\ndef map_nn_linear_op(bb, node_map, node, nn_mod):\n    # print(node_map)\n    # print(node)\n    # if node.args[0] == "view":\n    #     return map_nn_view_op(bb, node_map, node, nn_mod)\n\n    x = node_map[nod

```
# RelaxIR module error message
Node map: dict_keys([x, backbone_0, backbone_1, backbone_2])
NODE ARGS:
(backbone_2,)
X: lv5
---------------------------------------------------------------------------
TVMError                                  Traceback (most recent call last)
<ipython-input-55-00b0aa009f31> in <cell line: 193>()
    191
    192 # Translate from FX representation to Relax IR module
--> 193 RelaxModule = from_fx(
    194     fx_model,
    195     input_shapes = [(1, 3, 94, 24)],

3 frames
<ipython-input-55-00b0aa009f31> in from_fx(fx_mod, input_shapes, call_function_map, call_module_map)
    166                 elif node.op == "call_module":
    167                     named_module = named_modules[node.target]
--> 168                     node_map[node] = call_module_map[type(named_module)](bb, node_map, node, named_module)
    169                 elif node.op == "call_method":
    170                     # Explicitly handle view

<ipython-input-55-00b0aa009f31> in map_nn_maxpool3d_op(bb, node_map, node, nn_mod)
    118
    119     # Emit the Relax max pool 3d operation
--> 120     return bb.emit(relax.op.nn.max_pool3d(
    121         x, pool_size=kernel_size, strides=stride, padding=padding, dilation=dilation
    122     ))

/usr/local/lib/python3.10/dist-packages/tvm/relax/block_builder.py in emit(self, expr, name_hint)
    321         """
    322         expr = self._normalize_python_tuple(expr)
--> 323         return _ffi_api.BlockBuilderEmit(self, expr, name_hint)  # type: ignore
    324
    325     def call_te(self, func: Callable, *args: Any, **kwargs: Any) -> Expr:

tvm/_ffi/_cython/./packed_func.pxi in tvm._ffi._cy3.core.PackedFuncBase.__call__()

tvm/_ffi/_cython/./packed_func.pxi in tvm._ffi._cy3.core.FuncCall()

tvm/_ffi/_cython/./packed_func.pxi in tvm._ffi._cy3.core.FuncCall3()

tvm/_ffi/_cython/./base.pxi in tvm._ffi._cy3.core.CHECK_CALL()

/usr/local/lib/python3.10/dist-packages/tvm/_ffi/base.py in raise_last_ffi_error()
    479     _LIB.TVMDropLastPythonError()
    480
--> 481     raise py_err
    482
    483

TVMError: Traceback (most recent call last):
  13: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<tvm::relax::Var (tvm::relax::BlockBuilder, tvm::RelayExpr, tvm::runtime::String)>::AssignTypedLambda<tvm::relax::__mk_TVM6::{lambda(tvm::relax::BlockBuilder, tvm::RelayExpr, tvm::runtime::String)#1}>(tvm::relax::__mk_TVM6::{lambda(tvm::relax::BlockBuilder, tvm::RelayExpr, tvm::runtime::String)#1}, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::TVMRetValue)
  12: tvm::relax::BlockBuilderImpl::Emit(tvm::RelayExpr, tvm::runtime::String)
  11: tvm::relax::BlockBuilderImpl::Emit(tvm::RelayExpr, bool, tvm::runtime::String)
  10: tvm::relax::Normalizer::Normalize(tvm::RelayExpr const&)
  9: tvm::relax::ExprFunctor<tvm::RelayExpr (tvm::RelayExpr const&)>::VisitExpr(tvm::RelayExpr const&)
  8: _ZZN3tvm5relax11ExprFuncto
  7: tvm::relax::Normalizer::VisitExpr_(tvm::relax::CallNode const*)
  6: tvm::relax::Normalizer::InferStructInfo(tvm::relax::Call const&)
  5: _ZN3tvm7runtime13PackedFun
  4: tvm::runtime::TypedPackedFunc<tvm::relax::StructInfo (tvm::relax::Call const&, tvm::relax::BlockBuilder const&)>::AssignTypedLambda<tvm::relax::StructInfo (*)(tvm::relax::Call const&, tvm::relax::BlockBuilder const&)>(tvm::relax::StructInfo (*)(tvm::relax::Call const&, tvm::relax::BlockBuilder const&))::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
  3: tvm::relax::InferStructInfoPool3D(tvm::relax::Call const&, tvm::relax::BlockBuilder const&)
  2: tvm::relax::CheckNdimPerLayoutAndGetShape(tvm::relax::Call const&, tvm::relax::BlockBuilder const&, tvm::relax::TensorStructInfo const&, tvm::tir::Layout const&)
  1: _ZN3tvm5relax16BlockBuilderImpl11ReportFatalERKNS_1
  0: _ZN3tvm7runtime6deta
  File "/workspace/tvm/src/relax/ir/block_builder.cc", line 158
TVMError: In Op(relax.nn.max_pool3d), layout NCDHW requires the input to be 5-dim tensor. However, the given input has ndim 4
```

## Tensor Expression (MLC 2)

In [None]:
import tvm
from tvm import te
import numpy as np

def direct_conv2d(N, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, Input, Kernel):
    """
    Direct (sliding-window) 2-D convolution as a tensor expression.

    :param N, C, H, W: batch, input channels, input height and width
    :param K: number of output channels
    :param R, S: kernel height and width
    :param stride_h, stride_w: vertical and horizontal stride
    :param pad_h, pad_w: zero padding added on both sides of each spatial axis
    :param Input: tensor indexed as [n, c, h, w]
    :param Kernel: tensor indexed as [k, c, r, s]
    :return: (Conv, P, Q) - the output tensor of shape (N, K, P, Q) and its
             spatial extents
    """
    P = (H - R + 2 * pad_h) // stride_h + 1
    Q = (W - S + 2 * pad_w) // stride_w + 1

    if pad_h > 0 or pad_w > 0:
        # Materialise the zero padding in its own stage. Offsetting the index
        # alone would read Input at negative or past-the-end positions.
        Padded = te.compute(
            (N, C, H + 2 * pad_h, W + 2 * pad_w),
            lambda n, c, h, w: tvm.tir.if_then_else(
                tvm.tir.all(h >= pad_h, h < H + pad_h, w >= pad_w, w < W + pad_w),
                Input[n, c, h - pad_h, w - pad_w],
                tvm.tir.const(0.0, Input.dtype),
            ),
            name="ConvPad",
        )
    else:
        Padded = Input

    rc = te.reduce_axis((0, C), name="rc")
    ry = te.reduce_axis((0, R), name="ry")
    rx = te.reduce_axis((0, S), name="rx")
    Conv = te.compute(
        (N, K, P, Q),
        lambda n, k, p, q: te.sum(
            # Indices are relative to the padded tensor, so no pad offset here.
            Padded[n, rc, p * stride_h + ry, q * stride_w + rx] *
            Kernel[k, rc, ry, rx],
            axis=[rc, ry, rx]
        ),
        name="Conv"
    )
    return Conv, P, Q

def relu(N, C, H, W, Input):
    """Element-wise max(Input, 0) over an (N, C, H, W) tensor."""
    def _clamp(n, c, h, w):
        return te.max(Input[n, c, h, w], tvm.tir.const(0.0, "float32"))

    out_shape = (N, C, H, W)
    return te.compute(out_shape, _clamp, name="ReLU")

def batchnorm2d(N, C, H, W, Input, gamma, beta, mean, var, epsilon=1e-5):
    """
    Per-channel normalisation using the supplied statistics:
        (x - mean[c]) / sqrt(var[c] + epsilon) * gamma[c] + beta[c]
    gamma, beta, mean and var are indexed by channel only.
    """
    def _normalize(n, c, h, w):
        return (Input[n, c, h, w] - mean[c]) / te.sqrt(var[c] + epsilon) * gamma[c] + beta[c]

    out_shape = (N, C, H, W)
    return te.compute(out_shape, _normalize, name="BatchNorm")

def maxpool2d(N, C, H, W, Input, pool_h, pool_w, stride_h, stride_w):
    """
    Unpadded 2-D max pooling over the last two axes.
    Returns (Pool, outH, outW): the pooled tensor and its spatial extents.
    """
    outH = (H - pool_h) // stride_h + 1
    outW = (W - pool_w) // stride_w + 1

    # Reduction axes spanning one pooling window.
    ph = te.reduce_axis((0, pool_h), "ph")
    pw = te.reduce_axis((0, pool_w), "pw")

    def _window_max(n, c, h, w):
        return te.max(Input[n, c, h*stride_h+ph, w*stride_w+pw], axis=[ph, pw])

    Pool = te.compute((N, C, outH, outW), _window_max, name="MaxPool2D")
    return Pool, outH, outW

def avgpool2d(N, C, H, W, Input, pool_h, pool_w, stride_h, stride_w):
    """
    Unpadded 2-D average pooling over the last two axes.
    Returns (Avg, outH, outW): the pooled tensor and its spatial extents.
    """
    outH = (H - pool_h) // stride_h + 1
    outW = (W - pool_w) // stride_w + 1

    # Reduction axes spanning one pooling window.
    ph = te.reduce_axis((0, pool_h), "ph")
    pw = te.reduce_axis((0, pool_w), "pw")
    # Every window holds the same number of elements, so the divisor is constant.
    factor = pool_h * pool_w

    def _window_mean(n, c, h, w):
        return te.sum(Input[n, c, h*stride_h+ph, w*stride_w+pw], axis=[ph, pw]) / factor

    Avg = te.compute((N, C, outH, outW), _window_mean, name="AvgPool2D")
    return Avg, outH, outW

def small_basic_block(N, Cin, H, W, Cout, Input,
                      W_1x1a, W_3x1, W_1x3, W_1x1b):
    """
    Four-convolution bottleneck: 1x1 -> ReLU -> 3x1 -> ReLU -> 1x3 -> ReLU -> 1x1.
    The three inner stages run at Cout // 4 channels; the last 1x1 expands to
    Cout. No bias terms are applied.
    Returns (out, P, Q): the final conv output and its spatial extents.
    """
    mid = Cout // 4

    # 1x1 conv, Cin -> mid
    squeeze, h1, w1 = direct_conv2d(N, Cin, H, W, mid, 1, 1, 1, 1, 0, 0, Input, W_1x1a)
    squeeze_act = relu(N, mid, h1, w1, squeeze)

    # 3x1 conv (pad=(1,0))
    vertical, h2, w2 = direct_conv2d(N, mid, h1, w1, mid, 3, 1, 1, 1, 1, 0, squeeze_act, W_3x1)
    vertical_act = relu(N, mid, h2, w2, vertical)

    # 1x3 conv (pad=(0,1))
    horizontal, h3, w3 = direct_conv2d(N, mid, h2, w2, mid, 1, 3, 1, 1, 0, 1, vertical_act, W_1x3)
    horizontal_act = relu(N, mid, h3, w3, horizontal)

    # final 1x1 conv, mid -> Cout
    return direct_conv2d(N, mid, h3, w3, Cout, 1, 1, 1, 1, 0, 0, horizontal_act, W_1x1b)

# Build the first stages of the network as one tensor-expression graph
# (conv -> BN -> ReLU -> maxpool -> small_basic_block -> BN -> ReLU -> maxpool)
# and compile it for the CPU.

# Input dimensions
# NOTE(review): this declares height 94 and width 24. The quantization
# traceback later in this notebook suggests the real network expects the
# 24-pixel side as height - confirm which layout is intended here.
N, C, H, W = 1, 3, 94, 24
Input_tensor = te.placeholder((N, C, H, W), name="Input", dtype="float32")

# first layer: Conv(3->64, kernel=3x3, stride=1), no padding
W_conv1 = te.placeholder((64, 3, 3, 3), name="W_conv1", dtype="float32")
Conv1, H1, W1 = direct_conv2d(N, 3, H, W, 64, 3, 3, 1, 1, 0, 0, Input_tensor, W_conv1)

# BN + ReLU on Conv1 (per-channel parameters; dtype left at the placeholder default)
gamma1 = te.placeholder((64,), name="gamma1")
beta1  = te.placeholder((64,), name="beta1")
mean1  = te.placeholder((64,), name="mean1")
var1   = te.placeholder((64,), name="var1")

BN1 = batchnorm2d(N, 64, H1, W1, Conv1, gamma1, beta1, mean1, var1)
ReLU1 = relu(N, 64, H1, W1, BN1)

# MaxPool2d(3x3,stride=1x1)
Pool1, H2, W2 = maxpool2d(N, 64, H1, W1, ReLU1, 3, 3, 1, 1)

# small_basic_block(ch_in=64, ch_out=128); the inner convs run at 128 // 4 channels
W_1x1a_128 = te.placeholder((128//4, 64, 1, 1), name="W_1x1a_128")   # (32,64,1,1)
W_3x1_128  = te.placeholder((128//4, 128//4, 3, 1), name="W_3x1_128") # (32,32,3,1)
W_1x3_128  = te.placeholder((128//4, 128//4, 1, 3), name="W_1x3_128") # (32,32,1,3)
W_1x1b_128 = te.placeholder((128, 128//4, 1, 1), name="W_1x1b_128")   # (128,32,1,1)

Block1, H3, W3 = small_basic_block(N, 64, H2, W2, 128,
                                   Pool1, W_1x1a_128, W_3x1_128, W_1x3_128, W_1x1b_128)

# BN + ReLU for Block1 output
gamma2 = te.placeholder((128,), name="gamma2")
beta2  = te.placeholder((128,), name="beta2")
mean2  = te.placeholder((128,), name="mean2")
var2   = te.placeholder((128,), name="var2")

BN2 = batchnorm2d(N, 128, H3, W3, Block1, gamma2, beta2, mean2, var2)
ReLU2 = relu(N, 128, H3, W3, BN2)

# Next maxpool2d(3x3, stride=2x2)
Pool2, H4, W4 = maxpool2d(N, 128, H3, W3, ReLU2, 3, 3, 2, 2)

# Default schedule rooted at the last op; no manual optimisation is applied.
s = te.create_schedule([Pool2.op])

# The argument list fixes the call signature of the compiled function:
# all inputs in this order, then the output buffer (Pool2) last.
te_model = tvm.build(s, [Input_tensor, W_conv1, gamma1, beta1, mean1, var1,
                   W_1x1a_128, W_3x1_128, W_1x3_128, W_1x1b_128,
                   gamma2, beta2, mean2, var2, Pool2],
              target="llvm", name="lprnet")

print(te_model)

# Smoke-test the compiled function on the CPU with random inputs. This only
# checks that it runs and yields the expected output shape; the values are not
# compared against the real network.
ctx = tvm.cpu(0)

# create fake data just to try and test
# (unseeded, so the printed sample differs from run to run)
input_data = np.random.rand(N, C, H, W).astype("float32")
W_conv1_data = np.random.rand(64,3,3,3).astype("float32")
gamma1_data = np.random.rand(64).astype("float32")
beta1_data = np.random.rand(64).astype("float32")
mean1_data = np.random.rand(64).astype("float32")
var1_data = np.random.rand(64).astype("float32")

W_1x1a_128_data = np.random.rand(32,64,1,1).astype("float32")
W_3x1_128_data = np.random.rand(32,32,3,1).astype("float32")
W_1x3_128_data = np.random.rand(32,32,1,3).astype("float32")
W_1x1b_128_data = np.random.rand(128,32,1,1).astype("float32")

gamma2_data = np.random.rand(128).astype("float32")
beta2_data = np.random.rand(128).astype("float32")
mean2_data = np.random.rand(128).astype("float32")
var2_data = np.random.rand(128).astype("float32")

# Wrap every numpy array as a TVM NDArray on the target device.
input_tvm = tvm.nd.array(input_data, device=ctx)
W_conv1_tvm = tvm.nd.array(W_conv1_data, device=ctx)
gamma1_tvm = tvm.nd.array(gamma1_data, device=ctx)
beta1_tvm = tvm.nd.array(beta1_data, device=ctx)
mean1_tvm = tvm.nd.array(mean1_data, device=ctx)
var1_tvm = tvm.nd.array(var1_data, device=ctx)

W_1x1a_128_tvm = tvm.nd.array(W_1x1a_128_data, device=ctx)
W_3x1_128_tvm = tvm.nd.array(W_3x1_128_data, device=ctx)
W_1x3_128_tvm = tvm.nd.array(W_1x3_128_data, device=ctx)
W_1x1b_128_tvm = tvm.nd.array(W_1x1b_128_data, device=ctx)

gamma2_tvm = tvm.nd.array(gamma2_data, device=ctx)
beta2_tvm = tvm.nd.array(beta2_data, device=ctx)
mean2_tvm = tvm.nd.array(mean2_data, device=ctx)
var2_tvm = tvm.nd.array(var2_data, device=ctx)

# The output buffer must be pre-allocated with the shape of the final pool stage.
print("Pool2 shape:", (N, 128, H4, W4))
out_tvm = tvm.nd.empty((N, 128, H4, W4), dtype="float32", device=ctx)

# Arguments follow the order given to tvm.build: inputs first, output last.
te_model(
    input_tvm,
    W_conv1_tvm, gamma1_tvm, beta1_tvm, mean1_tvm, var1_tvm,
    W_1x1a_128_tvm, W_3x1_128_tvm, W_1x3_128_tvm, W_1x1b_128_tvm,
    gamma2_tvm, beta2_tvm, mean2_tvm, var2_tvm,
    out_tvm
)
print("Output shape:", out_tvm.shape)
# Show only a 5x5 corner of the first channel to keep the output short.
print("Output sample:", out_tvm.asnumpy()[0,0,0:5,0:5])


Module(llvm, 5c0a52e3ccc8)
Pool2 shape: (1, 128, 44, 9)
Output shape: (1, 128, 44, 9)
Output sample: [[3.2731887e+18 9.3689930e+06 9.3689930e+06 9.6803650e+06 9.6803650e+06]
 [9.8756460e+06 9.4522370e+06 9.2046290e+06 9.6803650e+06 9.6803650e+06]
 [9.8756460e+06 9.4832160e+06 8.9317540e+06 9.4175800e+06 9.4783460e+06]
 [9.4166190e+06 9.3443450e+06 8.9317540e+06 9.1230680e+06 9.6107960e+06]
 [9.1347880e+06 8.9859090e+06 8.7664820e+06 8.9080760e+06 9.4311190e+06]]


## Fine-Grained Pruning (MO1)

In [None]:
def fine_grained_prune(tensor: torch.Tensor, sparsity : float) -> torch.Tensor:
    """
    Magnitude-based unstructured pruning, applied to `tensor` in place.

    :param tensor: weights to prune; the smallest-magnitude entries are zeroed
    :param sparsity: target fraction of zeros, clamped to [0, 1]
    :return: mask with the shape of `tensor` that is truthy where a weight is
             kept. Entries tied with the threshold magnitude are pruned too,
             so the achieved sparsity can exceed the target.
    """
    sparsity = min(max(0.0, sparsity), 1.0)
    if sparsity == 1.0:
        tensor.zero_()
        return torch.zeros_like(tensor)
    elif sparsity == 0.0:
        return torch.ones_like(tensor)

    num_elements = tensor.numel()

    num_zeros = round(num_elements * sparsity)
    if num_zeros == 0:
        # The tensor is too small for this sparsity to remove even one element.
        # kthvalue(0) would raise, so keep everything, as for sparsity == 0.
        return torch.ones_like(tensor)

    importance = tensor.abs()
    # The num_zeros-th smallest magnitude is the cut-off; only strictly larger
    # magnitudes survive.
    threshold = importance.view(-1).kthvalue(num_zeros).values
    mask = torch.gt(importance, threshold)
    tensor.mul_(mask)

    return mask

In [None]:
class FineGrainedPruner:
    """Prunes a model once at construction and keeps the masks for re-use."""

    def __init__(self, model, sparsity_dict):
        # Pruning happens here, in place on `model`; only the masks are stored.
        self.masks = FineGrainedPruner.prune(model, sparsity_dict)

    @torch.no_grad()
    def apply(self, model):
        """Multiply each parameter that has a stored mask by that mask, in place."""
        for name, param in model.named_parameters():
            if name not in self.masks:
                continue
            param *= self.masks[name]

    @staticmethod
    @torch.no_grad()
    def prune(model, sparsity_dict):
        """
        Prune every parameter with more than one dimension and return
        {parameter name: mask}.

        `sparsity_dict` is either a dict mapping parameter names to sparsities
        or a single number in [0, 1) applied to all of them; a value of 0
        prunes nothing and stores no mask.
        """
        per_layer = isinstance(sparsity_dict, dict)
        masks = {}
        for name, param in model.named_parameters():
            # we only prune conv and fc weights
            if param.dim() <= 1:
                continue
            if per_layer:
                masks[name] = fine_grained_prune(param, sparsity_dict[name])
                continue
            assert(sparsity_dict < 1 and sparsity_dict >= 0)
            if sparsity_dict > 0:
                masks[name] = fine_grained_prune(param, sparsity_dict)
        return masks

In [None]:
sparse_model = copy.deepcopy(model)

# Sweep sparsity 0.1 ... 0.9 on the same model copy, so each pass prunes
# further from where the previous one stopped. Stepping an integer and dividing
# keeps the values exact; repeatedly adding 0.1 drifts (0.30000000000000004)
# and slips in an extra pass at 0.9999999999999999.
for step in range(1, 10):
    sparsity = step / 10

    pruner = FineGrainedPruner(sparse_model, sparsity)
    pruner.apply(sparse_model)
    print(f"Pruned model with sparsity: {sparsity}")
    sparse_model_size = get_model_size(sparse_model, data_width=32, count_nonzero_only=True)

    # get_model_size() returns bits: divide by 8 as well to report megabytes.
    print(f"Model size: {sparse_model_size / (8 * 1024 * 1024):.2f} MB")
    test_LPRNet.test(args, sparse_model)

    print("---------------")
    print()

```
#RAW DATA WITH 0.1 INCREMENTS
Pruned model with sparsity: 0.1
Model size: 12.29 MB
Build successful with provided model!
[Info] Test Accuracy: 0.898 [898:62:40:1000]
[Info] Individual Test Speed: 0.17621557784080505s 1/1000]
Total time: 176.21557784080505 seconds
---------------

Pruned model with sparsity: 0.2
Model size: 10.93 MB
Build successful with provided model!
[Info] Test Accuracy: 0.898 [898:64:38:1000]
[Info] Individual Test Speed: 0.17063515448570252s 1/1000]
Total time: 170.63515448570251 seconds
---------------

Pruned model with sparsity: 0.30000000000000004
Model size: 9.58 MB
Build successful with provided model!
[Info] Test Accuracy: 0.884 [884:71:45:1000]
[Info] Individual Test Speed: 0.15751717019081116s 1/1000]
Total time: 157.51717019081116 seconds
---------------

Pruned model with sparsity: 0.4
Model size: 8.23 MB
Build successful with provided model!
[Info] Test Accuracy: 0.876 [876:76:48:1000]
[Info] Individual Test Speed: 0.1495432131290436s 1/1000]
Total time: 149.54321312904358 seconds
---------------

Pruned model with sparsity: 0.5
Model size: 6.88 MB
Build successful with provided model!
[Info] Test Accuracy: 0.822 [822:102:76:1000]
[Info] Individual Test Speed: 0.1377841682434082s 1/1000]
Total time: 137.7841682434082 seconds
---------------

Pruned model with sparsity: 0.6
Model size: 5.52 MB
Build successful with provided model!
[Info] Test Accuracy: 0.677 [677:156:167:1000]
[Info] Individual Test Speed: 0.13184543347358704s 1/1000]
Total time: 131.84543347358704 seconds
---------------

Pruned model with sparsity: 0.7
Model size: 4.17 MB
Build successful with provided model!
[Info] Test Accuracy: 0.017 [17:869:114:1000]
[Info] Individual Test Speed: 0.11505629849433899s 1/1000]
Total time: 115.05629849433899 seconds
---------------

Pruned model with sparsity: 0.7999999999999999
Model size: 2.82 MB
Build successful with provided model!
[Info] Test Accuracy: 0.0 [0:944:56:1000]
[Info] Individual Test Speed: 0.07019736027717591s 1/1000]
Total time: 70.1973602771759 seconds
---------------

Pruned model with sparsity: 0.8999999999999999
Model size: 1.46 MB
Build successful with provided model!
[Info] Test Accuracy: 0.0 [0:924:76:1000]
[Info] Individual Test Speed: 0.029899073123931884s 1/1000]
Total time: 29.899073123931885 seconds
---------------

Pruned model with sparsity: 0.9999999999999999
Model size: 0.11 MB
Build successful with provided model!
[Info] Test Accuracy: 0.0 [0:1000:0:1000]
[Info] Individual Test Speed: 0.028096100807189942s 1/1000]
Total time: 28.09610080718994 seconds
---------------
```

## Fusion and Static Quantization (MO2)

Quantization done following this guide: https://pytorch.org/docs/stable/quantization.html


In [None]:
import torch
import torch.nn as nn
import torch.quantization
from torch.quantization import fuse_modules, get_default_qconfig, prepare, convert
import copy

# Work on an eval-mode CPU copy so the original model stays untouched.
qmodel = copy.deepcopy(model).to('cpu').eval()

# Set quantization engine
torch.backends.quantized.engine = 'x86'

# Each entry lists the qualified names of adjacent modules to merge into one.
fusion_list = [
    # Initial Conv-BN-ReLU
    ["backbone.0", "backbone.1", "backbone.2"],

    # small_basic_block at backbone.4
    # Conv->ReLU three times, then a final Conv
    ["backbone.4.block.0", "backbone.4.block.1"],
    ["backbone.4.block.2", "backbone.4.block.3"],
    ["backbone.4.block.4", "backbone.4.block.5"],
    # Fuse last Conv of block.4 with following BN and ReLU
    ["backbone.4.block.6", "backbone.5", "backbone.6"],

    # small_basic_block at backbone.8
    ["backbone.8.block.0", "backbone.8.block.1"],
    ["backbone.8.block.2", "backbone.8.block.3"],
    ["backbone.8.block.4", "backbone.8.block.5"],
    # Fuse last Conv of block.8 with following BN and ReLU
    ["backbone.8.block.6", "backbone.9", "backbone.10"],

    # small_basic_block at backbone.11
    ["backbone.11.block.0", "backbone.11.block.1"],
    ["backbone.11.block.2", "backbone.11.block.3"],
    ["backbone.11.block.4", "backbone.11.block.5"],
    # Fuse last Conv of block.11 with following BN and ReLU
    ["backbone.11.block.6", "backbone.12", "backbone.13"],

    # Fuse Conv-BN-ReLU near the end
    ["backbone.16", "backbone.17", "backbone.18"],
    ["backbone.20", "backbone.21", "backbone.22"],
]


# Make a copy of the model to fuse
model_fused = fuse_modules(qmodel, fusion_list, inplace=False)

# Attach the quantization config now so a later prepare() call can use it;
# nothing is quantized in this cell.
model_fused.qconfig = get_default_qconfig('x86')

# The fused model is still floating point; this measures the effect of fusion only.
print("Testing fused model")

test_LPRNet.test(args, model_fused)

# get_model_size() returns bits: divide by 8 as well to report megabytes.
fused_model_size = get_model_size(model_fused, data_width=32)
print(f"Fused Model size: {fused_model_size / (8 * 1024 * 1024):.2f} MB")


Testing fused model
Build successful with provided model!
[Info] Test Accuracy: 0.897 [897:61:42:1000]
[Info] Individual Test Speed: 0.03599678826332092s 1/1000]
Total time: 35.99678826332092 seconds
Fused Model size: 13.58 MB


```
# Raw fused model results
Testing fused model
Build successful with provided model!
[Info] Test Accuracy: 0.899 [899:58:43:1000]
[Info] Individual Test Speed: 0.03917880201339722s 1/1000]
Total time: 39.17880201339722 seconds
Model size: 13.58 Mb
```

In [None]:
# Combine fusion and unstructured pruning
# Prune a copy of the fused model to 40% sparsity so model_fused itself stays
# dense for the quantization cells below.
combined_model = copy.deepcopy(model_fused)
pruner = FineGrainedPruner(combined_model, 0.4)
pruner.apply(combined_model)

combined_model_size = get_model_size(combined_model, data_width=32, count_nonzero_only=True)

# get_model_size() returns bits: divide by 8 as well to report megabytes.
print(f"Combined Model size: {combined_model_size / (8 * 1024 * 1024):.2f} MB")
test_LPRNet.test(args, combined_model)


Combined Model size: 6.77 MB
Build successful with provided model!
[Info] Test Accuracy: 0.873 [873:80:47:1000]
[Info] Individual Test Speed: 0.034136982679367066s 1/1000]
Total time: 34.136982679367065 seconds


```
# Combined model data
Combined Model size: 6.77 MB
Build successful with provided model!
[Info] Test Accuracy: 0.872 [872:76:52:1000]
[Info] Individual Test Speed: 0.033661202907562256s 1/1000]
Total time: 33.661202907562256 seconds
```

In [None]:
# Static (post-training) quantization - disabled; remove the triple quotes to run.
# The RuntimeError recorded below this cell came from calibrating with a
# (N, 3, 94, 24) batch: with height and width in that order the network's
# average-pooling stage computes a zero-width output. The calibration batch
# now uses height 24 and width 94.
# NOTE(review): the model has no quant/dequant stubs, so the converted model
# may still reject float inputs - confirm before re-enabling.
"""
# Insert observers into a copy of the fused model (qconfig was set above)
prepared_model = prepare(model_fused)
prepared_model.eval()

# Calibrate: run data through the observers BEFORE converting
# NOTE(review): random data only exercises the pipeline; use real images
# for meaningful activation ranges
N, C, H, W = 100, 3, 24, 94 # should batch size be 100 or 1?
input_data = torch.randn(N, C, H, W)
with torch.no_grad():
    prepared_model(input_data)

# convert the calibrated model to a quantized version
quantized_model = convert(prepared_model)

# print(quantized_model)

print("Testing quantized model")

test_LPRNet.test(args, quantized_model)

# get_model_size() returns bits: divide by 8 as well to report megabytes
# NOTE(review): quantized modules may not expose weights via parameters(),
# so this count can under-report - confirm
quantized_model_size = get_model_size(quantized_model, data_width=8)
print(f"Model size: {quantized_model_size / (8 * 1024 * 1024):.2f} MB")
"""

'\n# Prepare the model for static quantization\nprepared_model = prepare(model_fused)\n\n# convert the model to a quantized version\nquantized_model = convert(model_fused, inplace=False).to(\'cpu\')\nprepared_model.eval()\nN, C, H, W = 100, 3, 94, 24 # should batch size be 100 or 1?\ninput_data = torch.randn(N, C, H, W)\nprepared_model(input_data)\n\nquantized_model = convert(prepared_model)\n\n# print(quantized_model)\n\nprint("Testing quantized model")\n\ntest_LPRNet.test(args, quantized_model)\n\nquantized_model_size = get_model_size(quantized_model, data_width=8)\nprint(f"Model size: {quantized_model_size / (1024 * 1024):.2f} MB")\n'

```
# Quantized model Error Message
/usr/local/lib/python3.10/dist-packages/torch/ao/quantization/observer.py:229: UserWarning: Please use quant_min and quant_max to specify the range for observers.                     reduce_range will be deprecated in a future release of PyTorch.
  warnings.warn(
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-9-528930bb2c49> in <cell line: 9>()
      7 N, C, H, W = 100, 3, 94, 24
      8 input_data = torch.randn(N, C, H, W)
----> 9 prepared_model(input_data)
     10
     11 quantized_model = convert(prepared_model)

5 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/pooling.py in forward(self, input)
    754
    755     def forward(self, input: Tensor) -> Tensor:
--> 756         return F.avg_pool2d(
    757             input,
    758             self.kernel_size,

RuntimeError: Given input size: (256x88x9). Calculated output size: (256x22x0). Output size is too small
```

In [None]:
# Previous attempt at dynamic quantization, kept for the record only. The cell
# is disabled by wrapping it in a string; it gave no improvement over the
# original model.
# NOTE(review): quantize_dynamic is documented mainly for Linear and recurrent
# layers, so the Conv2d layers were probably left unquantized - confirm.
"""
import torch
import torch.quantization
torch.backends.quantized.engine = 'fbgemm'

qmodel = copy.deepcopy(model).eval()

for name, module in qmodel.named_modules():
    print(name, module)

def quantize_model(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Conv2d):
            torch.quantization.convert(module, inplace=True)
    return torch.quantization.quantize_dynamic(
        model, {torch.nn.Conv2d, torch.nn.Linear}, dtype=torch.qint8
    )

quantized_model = torch.quantization.quantize_dynamic(
    qmodel, {torch.nn.Conv2d}, dtype=torch.qint8
)

quantized_model = quantize_model(qmodel)

print(quantized_model)

print("Testing quantized model")
test_LPRNet.test(args, quantized_model)

quantized_model_size = get_model_size(quantized_model, data_width=8)
print(f"Model size: {quantized_model_size / (1024 * 1024):.2f} MB")
"""

'\nimport torch\nimport torch.quantization\ntorch.backends.quantized.engine = \'fbgemm\'\n\nqmodel = copy.deepcopy(model).eval()\n\nfor name, module in qmodel.named_modules():\n    print(name, module)\n\ndef quantize_model(model):\n    for name, module in model.named_modules():\n        if isinstance(module, torch.nn.Conv2d):\n            torch.quantization.convert(module, inplace=True)\n    return torch.quantization.quantize_dynamic(\n        model, {torch.nn.Conv2d, torch.nn.Linear}, dtype=torch.qint8\n    )\n\nquantized_model = torch.quantization.quantize_dynamic(\n    qmodel, {torch.nn.Conv2d}, dtype=torch.qint8\n)\n\nquantized_model = quantize_model(qmodel)\n\nprint(quantized_model)\n\nprint("Testing quantized model")\ntest_LPRNet.test(args, quantized_model)\n\nquantized_model_size = get_model_size(quantized_model, data_width=8)\nprint(f"Model size: {quantized_model_size / (1024 * 1024):.2f} MB")\n'

## Original testing script


In [None]:
# Run the cloned repository's test_LPRNet.py as a standalone script via the shell.
# No command-line arguments are passed, so the script uses whatever defaults it defines.
!python3 LPRNet_Pytorch/test_LPRNet.py

## Raw data

Note: All tests were run on the CPU rather than on CUDA, because some of the optimized models were having issues.

Original Model:
- Model size: 13.64 MB
- Model sparsity: 0.0000
- Build successful with provided model!
- [Info] Test Accuracy: 0.897
- [Info] Individual Test Speed: 0.2103862180709839s per sample (1/1000)
- Total time: 210.3862180709839 seconds

Fused Model:
- [Info] Test Accuracy: 0.899 [899:59:42:1000]
- [Info] Individual Test Speed: 0.05818154287338257s per sample (1/1000)
- Total time: 58.18154287338257 seconds



Structure of LPRNet, used to determine which layers can be fused:
```
LPRNet(
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=0, dilation=1, ceil_mode=False)
    (4): small_basic_block(
      (block): Sequential(
        (0): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1))
        (1): ReLU()
        (2): Conv2d(32, 32, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
        (3): ReLU()
        (4): Conv2d(32, 32, kernel_size=(1, 3), stride=(1, 1), padding=(0, 1))
        (5): ReLU()
        (6): Conv2d(32, 128, kernel_size=(1, 1), stride=(1, 1))
      )
    )
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool3d(kernel_size=(1, 3, 3), stride=(2, 1, 2), padding=0, dilation=1, ceil_mode=False)
    (8): small_basic_block(
      (block): Sequential(
        (0): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))
        (1): ReLU()
        (2): Conv2d(64, 64, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
        (3): ReLU()
        (4): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), padding=(0, 1))
        (5): ReLU()
        (6): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
      )
    )
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): small_basic_block(
      (block): Sequential(
        (0): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1))
        (1): ReLU()
        (2): Conv2d(64, 64, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
        (3): ReLU()
        (4): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), padding=(0, 1))
        (5): ReLU()
        (6): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1))
      )
    )
    (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
    (14): MaxPool3d(kernel_size=(1, 3, 3), stride=(4, 1, 2), padding=0, dilation=1, ceil_mode=False)
    (15): Dropout(p=0, inplace=False)
    (16): Conv2d(64, 256, kernel_size=(1, 4), stride=(1, 1))
    (17): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (18): ReLU()
    (19): Dropout(p=0, inplace=False)
    (20): Conv2d(256, 68, kernel_size=(13, 1), stride=(1, 1))
    (21): BatchNorm2d(68, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (22): ReLU()
  )
  (container): Sequential(
    (0): Conv2d(516, 68, kernel_size=(1, 1), stride=(1, 1))
  )
)
```

At the top level of the backbone, the Conv2d + BatchNorm + ReLU pattern appears at indices (0, 1, 2), (16, 17, 18), and (20, 21, 22). Inside each small basic block, there are Conv2d + ReLU pairs at indices (0, 1), (2, 3), and (4, 5) that we can also fuse together.