## Installation & Setup

In [1]:
# Install the `mlc-ai-cpu` wheel; `-f` adds https://mlc.ai/wheels as an extra place to look for it.
# NOTE(review): presumably this wheel is what provides the `tvm` package imported below — confirm.
!python3 -m  pip install mlc-ai-cpu -f https://mlc.ai/wheels

Looking in links: https://mlc.ai/wheels
Collecting mlc-ai-cpu
  Downloading https://github.com/mlc-ai/package/releases/download/v0.9.dev0/mlc_ai_cpu-0.17.2-cp310-cp310-manylinux_2_28_x86_64.whl (185.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.8/185.8 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mlc-ai-cpu
Successfully installed mlc-ai-cpu-0.17.2


In [2]:
# Clone the course-project repository into the current directory.
# NOTE(review): the `data.load_data` / `model.LPRNet` imports and the ./weights and ./data/test
# paths used later are presumably supplied by this checkout — confirm.
! git clone https://github.com/Yogesh31Hasabe/NCSU-CSC_591-RealTime_AI_and_Machine_Learning_Systems-CourseProject-LPRNet.git

Cloning into 'NCSU-CSC_591-RealTime_AI_and_Machine_Learning_Systems-CourseProject-LPRNet'...
remote: Enumerating objects: 1027, done.
remote: Counting objects: 100% (1027/1027), done.
remote: Compressing objects: 100% (1022/1022), done.
remote: Total 1027 (delta 4), reused 1021 (delta 2), pack-reused 0 (from 0)
Receiving objects: 100% (1027/1027), 19.08 MiB | 23.69 MiB/s, done.
Resolving deltas: 100% (4/4), done.


In [3]:
# Make the cloned repository the working directory so the relative paths and
# project imports used by the following cells resolve. The explicit `%cd` magic
# does not depend on IPython's automagic setting (a bare `cd` does).
%cd NCSU-CSC_591-RealTime_AI_and_Machine_Learning_Systems-CourseProject-LPRNet

/content/NCSU-CSC_591-RealTime_AI_and_Machine_Learning_Systems-CourseProject-LPRNet


In [4]:
import tvm
import torch.nn as nn
from tvm import relay
from tvm.contrib.download import download_testdata
from data.load_data import CHARS, CHARS_DICT, LPRDataLoader
from torchvision.transforms import functional as TF
from types import SimpleNamespace
from PIL import Image, ImageDraw, ImageFont
from model.LPRNet import build_lprnet
from torch.autograd import Variable
from tvm.contrib import graph_executor
import torch.nn.functional as F
import tvm.auto_scheduler as auto_scheduler
from tvm.autotvm.tuner import XGBTuner
from tvm import autotvm
from torch.utils.data import *
from torch import optim
import torch.nn as nn
import numpy as np
import argparse
import torch
import time
import cv2
import os

## Baseline Model Accuracy

In [5]:
# Run the repository's own evaluation script to get the baseline (un-compiled) numbers.
# The working directory is already the repository root, so a relative path is used
# instead of a machine-specific absolute one.
! python test_LPRNet.py

Successful to build network!
  lprnet.load_state_dict(torch.load(args.pretrained_model, map_location=torch.device('cpu')))
load pretrained model successful!
[Info] Test Accuracy: 0.902 [902:55:43:1000]
[Info] Test Speed: 0.18756708931922914s 1/1000]


## JIT Trace for NN Module

In [6]:
class small_basic_block(nn.Module):
    """Four-convolution block used inside the LPRNet backbone.

    The input is projected to ``ch_out // 4`` channels by a 1x1 convolution,
    passed through a (3, 1) and then a (1, 3) convolution (each padded so the
    spatial size is preserved), and finally expanded to ``ch_out`` channels by
    another 1x1 convolution. A ReLU follows every convolution except the last.

    Args:
        ch_in: Number of input channels.
        ch_out: Number of output channels; the inner width is ``ch_out // 4``.
    """

    def __init__(self, ch_in, ch_out):
        super().__init__()
        mid = ch_out // 4
        layers = [
            nn.Conv2d(ch_in, mid, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(mid, mid, kernel_size=(3, 1), padding=(1, 0)),
            nn.ReLU(),
            nn.Conv2d(mid, mid, kernel_size=(1, 3), padding=(0, 1)),
            nn.ReLU(),
            nn.Conv2d(mid, ch_out, kernel_size=1),
        ]
        # The attribute name and the layer order determine the state-dict keys,
        # so both are kept as they are.
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the convolution stack to ``x`` and return the result."""
        out = self.block(x)
        return out

class LPRNet(nn.Module):
    """License-plate recognition network: a convolutional backbone plus a 1x1 fusion head.

    Four intermediate activations of the backbone are tapped, pooled to a common
    spatial size, normalised, concatenated along the channel axis and fused by
    ``container``. The result is averaged over dimension 2 to give the logits.

    For the ``(N, 3, 24, 94)`` inputs used in this notebook with ``class_num=68``
    the recorded output shape is ``(N, 68, 18)``.

    Args:
        lpr_max_len: Stored on the instance; not used by the layers defined here.
        phase: Stored on the instance; not used by the layers defined here.
        class_num: Number of output classes (channels of the logits).
        dropout_rate: Probability passed to both ``nn.Dropout`` layers.
    """

    def __init__(self, lpr_max_len, phase, class_num, dropout_rate):
        super().__init__()
        self.phase = phase
        self.lpr_max_len = lpr_max_len
        self.class_num = class_num

        # Index comments matter: forward() taps the outputs of layers 2, 6, 13 and 22,
        # and the positions also determine the state-dict keys.
        # NOTE(review): each block after a MaxPool3d with a leading stride of 2 or 4
        # expects 64 input channels, so that stride appears to subsample the channel
        # axis of the 4-D activations — confirm against the PyTorch version in use.
        stages = [
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1),  # 0
            nn.BatchNorm2d(num_features=64),  # 1
            nn.ReLU(),  # 2  (tapped)
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 1, 1)),  # 3
            small_basic_block(ch_in=64, ch_out=128),  # 4
            nn.BatchNorm2d(num_features=128),  # 5
            nn.ReLU(),  # 6  (tapped)
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(2, 1, 2)),  # 7
            small_basic_block(ch_in=64, ch_out=256),  # 8
            nn.BatchNorm2d(num_features=256),  # 9
            nn.ReLU(),  # 10
            small_basic_block(ch_in=256, ch_out=256),  # 11
            nn.BatchNorm2d(num_features=256),  # 12
            nn.ReLU(),  # 13 (tapped)
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(4, 1, 2)),  # 14
            nn.Dropout(dropout_rate),  # 15
            nn.Conv2d(in_channels=64, out_channels=256, kernel_size=(1, 4), stride=1),  # 16
            nn.BatchNorm2d(num_features=256),  # 17
            nn.ReLU(),  # 18
            nn.Dropout(dropout_rate),  # 19
            nn.Conv2d(in_channels=256, out_channels=class_num, kernel_size=(13, 1), stride=1),  # 20
            nn.BatchNorm2d(num_features=class_num),  # 21
            nn.ReLU(),  # 22 (tapped)
        ]
        self.backbone = nn.Sequential(*stages)

        # 448 = 64 + 128 + 256: channel counts of the first three tapped activations;
        # the fourth tap contributes class_num channels.
        self.container = nn.Sequential(
            nn.Conv2d(
                in_channels=448 + self.class_num,
                out_channels=self.class_num,
                kernel_size=(1, 1),
                stride=(1, 1),
            ),
        )

    def forward(self, x):
        """Run the backbone, fuse the tapped feature maps and return the logits."""
        tap_indices = (2, 6, 13, 22)
        taps = []
        for idx, layer in enumerate(self.backbone):
            x = layer(x)
            if idx in tap_indices:
                taps.append(x)

        scaled = []
        for pos, feat in enumerate(taps):
            # Bring the earlier (larger) feature maps down to the size of the last tap.
            if pos < 2:
                feat = nn.functional.avg_pool2d(feat, kernel_size=5, stride=5)
            elif pos == 2:
                feat = nn.functional.avg_pool2d(feat, kernel_size=(4, 10), stride=(4, 2))
            # Divide by the mean of the squared values, a single scalar over the whole tensor.
            mean_sq = feat.pow(2).mean()
            scaled.append(feat / mean_sq)

        fused = self.container(torch.cat(scaled, 1))
        logits = fused.mean(dim=2)
        return logits

def build_lprnet(lpr_max_len=8, phase=False, class_num=66, dropout_rate=0.5):
    """Construct an ``LPRNet`` and switch it to training or evaluation mode.

    Args:
        lpr_max_len: Forwarded to ``LPRNet``.
        phase: ``"train"`` or ``True`` selects training mode; any other value
            (including the default ``False``) selects evaluation mode.
        class_num: Number of output classes, forwarded to ``LPRNet``.
        dropout_rate: Dropout probability, forwarded to ``LPRNet``.

    Returns:
        The network, after ``.train()`` or ``.eval()`` has been called on it.
    """
    net = LPRNet(lpr_max_len, phase, class_num, dropout_rate)

    # This notebook passes the phase as a boolean flag. `True == "train"` is False,
    # so a boolean True used to be silently treated as "evaluate"; accept both spellings.
    training = phase is True or phase == "train"
    return net.train() if training else net.eval()

In [7]:
# Build the 68-class network in evaluation mode and restore the trained weights.
lprnet = build_lprnet(lpr_max_len=8, phase=False, class_num=68, dropout_rate=0.5)
# The file is consumed directly as a state dict, so loading can be restricted to
# tensors and plain containers (weights_only=True) instead of arbitrary pickled objects.
lprnet.load_state_dict(
    torch.load(
        "./weights/Final_LPRNet_model.pth",
        map_location=torch.device('cpu'),
        weights_only=True,
    )
)
lprnet.eval()

# Trace with a single 3x24x94 example and persist the TorchScript module.
example_input = torch.randn(1, 3, 24, 94)
lprnet_traced_model = torch.jit.trace(lprnet, example_input)
lprnet_traced_model.save("./weights/lprnet_mlc_optimization.pt")

# Load the TorchScript model back from disk; this copy is what gets handed to TVM.
scripted_model = torch.jit.load("./weights/lprnet_mlc_optimization.pt")
scripted_model.eval()

  lprnet.load_state_dict(torch.load("./weights/Final_LPRNet_model.pth",  map_location=torch.device('cpu')))


RecursiveScriptModule(
  original_name=LPRNet
  (backbone): RecursiveScriptModule(
    original_name=Sequential
    (0): RecursiveScriptModule(original_name=Conv2d)
    (1): RecursiveScriptModule(original_name=BatchNorm2d)
    (2): RecursiveScriptModule(original_name=ReLU)
    (3): RecursiveScriptModule(original_name=MaxPool3d)
    (4): RecursiveScriptModule(
      original_name=small_basic_block
      (block): RecursiveScriptModule(
        original_name=Sequential
        (0): RecursiveScriptModule(original_name=Conv2d)
        (1): RecursiveScriptModule(original_name=ReLU)
        (2): RecursiveScriptModule(original_name=Conv2d)
        (3): RecursiveScriptModule(original_name=ReLU)
        (4): RecursiveScriptModule(original_name=Conv2d)
        (5): RecursiveScriptModule(original_name=ReLU)
        (6): RecursiveScriptModule(original_name=Conv2d)
      )
    )
    (5): RecursiveScriptModule(original_name=BatchNorm2d)
    (6): RecursiveScriptModule(original_name=ReLU)
    (7): Recu

## Compile into Relay Module

In [8]:
# Evaluation settings shared by the cells below, exposed as attributes (args.<name>).
args = SimpleNamespace(
    img_size=[94, 24],
    test_img_dirs="./data/test",
    dropout_rate=0,
    lpr_max_len=8,
    test_batch_size=100,
    phase_train=False,
    num_workers=2,
    cuda=False,
    show=False,
    pretrained_model='./weights/Final_LPRNet_model.pth',
)

In [9]:
# Relay is given a fixed input signature: one tensor named "input0" holding a
# full evaluation batch of 3x24x94 images.
input_name = "input0"
input_shape = (args.test_batch_size, 3, 24, 94)
input_shapes = [(input_name, input_shape)]

# Convert the TorchScript module into a Relay module plus its parameters.
mod, params = relay.frontend.from_pytorch(scripted_model, input_shapes)

# Pick the device that matches the compilation target.
target = "llvm"
if target == "cuda":
    dev = tvm.cuda(0)
else:
    dev = tvm.cpu()

# Build with TVM's standard optimisations (opt_level=3) and wrap the result
# in a graph executor bound to the selected device.
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target, params=params)

module = graph_executor.GraphModule(lib["default"](dev))



## Test Function - 1 : Accuracy & Speed

In [10]:
def collate_fn(batch):
    """Merge ``(img, label, length)`` samples into one batch.

    Args:
        batch: Iterable of 3-tuples; ``img`` is a NumPy array, ``label`` an
            iterable of label values and ``length`` the label's length.

    Returns:
        A tuple ``(images, labels, lengths)``: the images stacked along a new
        first dimension, all labels concatenated into one flat float32 tensor,
        and the per-sample lengths as a list.
    """
    images = [torch.from_numpy(img) for img, _, _ in batch]
    lengths = [length for _, _, length in batch]
    # Labels have different lengths, so they are concatenated rather than stacked;
    # `lengths` lets the consumer split them apart again.
    flat_labels = [value for _, label, _ in batch for value in label]
    labels = np.asarray(flat_labels).flatten().astype(np.float32)

    return (torch.stack(images, 0), torch.from_numpy(labels), lengths)

def test(module):
    """Evaluate a compiled graph module on the test images configured in ``args``.

    ``args.test_img_dirs`` is user-expanded and split on commas, so it may name
    several directories. Results are printed by ``Greedy_Decode_Eval``.
    """
    img_dirs = os.path.expanduser(args.test_img_dirs).split(',')
    dataset = LPRDataLoader(img_dirs, args.img_size, args.lpr_max_len)
    Greedy_Decode_Eval(module, dataset, args)

def _greedy_decode(prebs, blank):
    """Turn raw network outputs into label sequences by greedy decoding.

    Args:
        prebs: Array indexed as ``[sample, class, position]``.
        blank: Class index of the blank symbol.

    Returns:
        One list of class indices per sample, with blanks removed and
        consecutive repeats collapsed.
    """
    decoded = []
    for preb in prebs:
        # Best class at every position.
        best = np.argmax(preb, axis=0)
        sequence = []
        pre_c = best[0]
        if pre_c != blank:
            sequence.append(pre_c)
        for c in best:
            # Skip repeats of the previous symbol and blanks; a blank resets the
            # "previous symbol" so the same character may follow it again.
            if pre_c == c or c == blank:
                if c == blank:
                    pre_c = c
                continue
            sequence.append(c)
            pre_c = c
        decoded.append(sequence)
    return decoded


def Greedy_Decode_Eval(module, datasets, args):
    """Measure accuracy and per-sample time of a TVM graph module.

    Only full batches are evaluated (``len(datasets) // args.test_batch_size``),
    because the module was compiled for a fixed batch size. The input tensor name
    is read from the notebook-level ``input_name``.

    Args:
        module: Graph executor exposing ``set_input``, ``run`` and ``get_output``.
        datasets: Dataset yielding ``(img, label, length)`` samples.
        args: Settings object; ``test_batch_size`` and ``num_workers`` are read.
            ``args.cuda`` is intentionally ignored: the executor is fed host
            NumPy arrays, and calling ``.numpy()`` on a CUDA tensor would fail.

    Prints:
        ``Output shape`` per batch, then accuracy as
        ``[correct:length-mismatch:content-mismatch:total]`` and the elapsed time
        divided by ``len(datasets)``.
    """
    epoch_size = len(datasets) // args.test_batch_size
    batch_iterator = iter(DataLoader(datasets, args.test_batch_size, shuffle=True,
                                     num_workers=args.num_workers, collate_fn=collate_fn))
    blank = len(CHARS) - 1

    Tp = 0
    Tn_1 = 0
    Tn_2 = 0
    t1 = time.time()
    for _ in range(epoch_size):
        images, labels, lengths = next(batch_iterator)

        # Split the flat label tensor back into one array per sample. A plain list
        # is kept (not np.array) so labels of different lengths do not break.
        targets = []
        start = 0
        for length in lengths:
            targets.append(labels[start:start + length].numpy())
            start += length

        # Forward pass through the compiled module.
        module.set_input(input_name, tvm.nd.array(images.numpy()))
        module.run()
        prebs = module.get_output(0).asnumpy()
        print("Output shape:", prebs.shape)

        for predicted, target in zip(_greedy_decode(prebs, blank), targets):
            if len(predicted) != len(target):
                Tn_1 += 1
            elif (np.asarray(target) == np.asarray(predicted)).all():
                Tp += 1
            else:
                Tn_2 += 1
    # Stop the clock before printing so the report itself is not timed.
    t2 = time.time()

    total = Tp + Tn_1 + Tn_2
    # No full batch (dataset smaller than the batch size) would otherwise divide by zero.
    Acc = Tp * 1.0 / total if total else 0.0
    print("[Info] Test Accuracy: {} [{}:{}:{}:{}]".format(Acc, Tp, Tn_1, Tn_2, total))
    per_sample = (t2 - t1) / len(datasets) if len(datasets) else 0.0
    print("[Info] Test Speed: {}s 1/{}]".format(per_sample, len(datasets)))

## MLC Optimization: Manual Optimization

In [11]:
# 1. Canonicalization and Simplification
# Hand-picked Relay passes, applied one after another to the converted module.
# Each pass is invoked directly (not via tvm.transform.Sequential) so it runs
# regardless of the ambient PassContext's opt_level.
manual_passes = [
    # 1. Canonicalization and simplification
    relay.transform.InferType(),
    relay.transform.SimplifyInference(),
    relay.transform.CanonicalizeOps(),
    # 2. Constant folding and merging of parallel convolutions
    relay.transform.FoldConstant(),
    relay.transform.CombineParallelConv2D(),
    # 3. Layout transformation (if applicable) for convolutions and other spatial ops
    relay.transform.AlterOpLayout(),
    # 4. Dead code elimination
    relay.transform.DeadCodeElimination(),
    # 5. Common-subexpression elimination
    relay.transform.EliminateCommonSubexpr(),
]

mod_manual = mod
for relay_pass in manual_passes:
    mod_manual = relay_pass(mod_manual)

In [12]:
# Define the target device
# Compile the manually optimised module for the CPU and evaluate it.
target = "llvm"
if target == "cuda":
    dev = tvm.cuda(0)
else:
    dev = tvm.cpu()

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod_manual, target=target, params=params)

# Wrap the compiled library in a graph executor bound to the chosen device.
module = graph_executor.GraphModule(lib["default"](dev))

print('Testing the model after manual optimization \n')
test(module)

Testing the model after manual optimization 

Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
[Info] Test Accuracy: 0.899 [899:61:40:1000]
[Info] Test Speed: 0.03605321073532104s 1/1000]


## MLC Optimization: Auto-Tuning Optimization

In [13]:
# Measurement settings for AutoTVM's local runner.
number = 10        # runs averaged into one measurement
repeat = 1         # measurements taken per candidate
timeout = 10       # in seconds
min_repeat_ms = 0  # since we're tuning on a CPU, can be set to 0

# NOTE(review): TVM's documentation ties `enable_cpu_cache_flush` to `number=1`;
# confirm that combining it with number=10 is intended.
runner = autotvm.LocalRunner(
    enable_cpu_cache_flush=True,
    min_repeat_ms=min_repeat_ms,
    timeout=timeout,
    repeat=repeat,
    number=number,
)

In [14]:
# Tuning configuration: tuner type, trial budget per task, measurement setup and
# the log file that collects the tuning records.
tuning_option = {
    "tuner": "xgb",
    "trials": 20,
    "early_stopping": 100,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(build_func="default"), runner=runner
    ),
    "tuning_records": "lprnet-autotuning.json",
}

# Extract the tunable tasks from the Relay module converted from the traced PyTorch model.
tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

# Tune the extracted tasks sequentially.
for i, task in enumerate(tasks):
    prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

    # Choose the tuner from the configuration instead of hard-coding it here.
    tuner = tuning_option["tuner"]
    if tuner == "xgb":
        tuner_obj = XGBTuner(task, loss_type="reg")
    else:
        raise ValueError("Invalid tuner: " + tuner)

    # A task's search space can be smaller than the trial budget; use the clamped
    # count for the progress bar too so it can actually reach 100%.
    n_trial = min(tuning_option["trials"], len(task.config_space))

    tuner_obj.tune(
        n_trial=n_trial,
        early_stopping=tuning_option["early_stopping"],
        measure_option=tuning_option["measure_option"],
        callbacks=[
            autotvm.callback.progress_bar(n_trial, prefix=prefix),
            autotvm.callback.log_to_file(tuning_option["tuning_records"]),
        ],
    )

[Task  1/13]  Current/Best:    3.18/  16.42 GFLOPS | Progress: (20/20) | 54.26 s Done.
[Task  2/13]  Current/Best:   13.89/  18.15 GFLOPS | Progress: (20/20) | 55.05 s Done.
[Task  3/13]  Current/Best:   12.41/  15.34 GFLOPS | Progress: (20/20) | 55.97 s Done.
[Task  4/13]  Current/Best:   18.36/  18.36 GFLOPS | Progress: (20/20) | 58.21 s Done.
[Task  5/13]  Current/Best:    9.60/  12.91 GFLOPS | Progress: (20/20) | 97.45 s Done.
[Task  6/13]  Current/Best:    5.63/  12.28 GFLOPS | Progress: (20/20) | 41.71 s Done.
[Task  7/13]  Current/Best:   12.51/  15.32 GFLOPS | Progress: (20/20) | 83.31 s Done.
[Task  8/13]  Current/Best:   14.20/  19.55 GFLOPS | Progress: (20/20) | 78.59 s Done.
[Task  9/13]  Current/Best:    6.97/  16.60 GFLOPS | Progress: (20/20) | 112.39 s Done.
[Task 10/13]  Current/Best:   11.37/  19.29 GFLOPS | Progress: (20/20) | 100.59 s Done.
[Task 11/13]  Current/Best:    5.43/  19.75 GFLOPS | Progress: (20/20) | 117.34 s Done.
[Task 12/13]  Current/Best:    0.00/  11

In [15]:
# Rebuild the (un-modified) Relay module while the tuning records are applied,
# then evaluate the resulting graph module.
with autotvm.apply_history_best(tuning_option["tuning_records"]), \
        tvm.transform.PassContext(opt_level=3, config={}):
    lib = relay.build(mod, target=target, params=params)

dev = tvm.device(str(target), 0)
module = graph_executor.GraphModule(lib["default"](dev))

print('Testing the model after auto optimization \n')
test(module)

Testing the model after auto optimization 

Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
Output shape: (100, 68, 18)
[Info] Test Accuracy: 0.898 [898:61:41:1000]
[Info] Test Speed: 0.0310540030002594s 1/1000]


## Save Module

In [16]:
# Save the module (includes weights/parameters)
# Write the most recently built library to disk so its size can be inspected.
module_path = "module.tar"
lib.export_library(module_path)
print("Module saved to {}".format(module_path))

Module saved to module.tar


## Test Function - 2 : Size

In [17]:
def print_size_of_model(model, model_name="Model"):
    """
    Save the model's state dict temporarily to measure its size on disk and print the size.

    The file is written inside a private temporary directory that is removed even
    if saving fails, so nothing is left behind and no file in the working
    directory can be overwritten.

    Args:
        model (torch.nn.Module): The model to evaluate; only ``state_dict()`` is used.
        model_name (str): Name of the model for reference in output.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        # NOTE(review): the original file name is kept on the assumption that the
        # serialised size can depend on it — confirm.
        tmp_path = os.path.join(scratch_dir, "temp_delme.p")
        torch.save(model.state_dict(), tmp_path)
        # Size is reported in kilobytes of 1000 bytes.
        model_size_kb = os.path.getsize(tmp_path) / 1e3
    print(f"{model_name} Size (KB): {model_size_kb:.2f}")

In [18]:
def print_size_of_module(module_path, module_name="Module"):
    """Print the on-disk size of the file at ``module_path`` in KB (1 KB = 1000 bytes).

    If the path does not exist, a "not found" message is printed instead.

    Args:
        module_path (str): Path of the exported module file.
        module_name (str): Label used in the printed message.
    """
    if not os.path.exists(module_path):
        print(f"{module_name} not found at {module_path}")
        return

    size_bytes = os.path.getsize(module_path)
    module_size_kb = size_bytes / 1e3
    print(f"{module_name} Size (KB): {module_size_kb:.2f}")


In [19]:
# Compare the serialised PyTorch state dict with the exported TVM module file.
module_path = "./module.tar"

print('Size of the model before MLC Optimization:')
print_size_of_model(lprnet, "Original Model")
print("\n")

print('Size of the MLC Optimized Module:')
print_size_of_module(module_path, "TVM Optimized Module")

Size of the model before MLC Optimization:
Original Model Size (KB): 1816.74


Size of the MLC Optimized Module:
TVM Optimized Module Size (KB): 840.69
