# Table of Contents
1. [Dependency](#de)
2. [Get Models & Export to Onnx](#gm)
3. [TVM Config](#cfg)
4. [Get Module](#gmo)
5. [Auto Scheduling](#as)
6. [Multiple Scheduling](#ms)
7. [Evaluate Inference](#in)

# <a name="de"></a>Dependency

In [1]:
import numpy as np
import onnx

import tvm
from tvm import relay, auto_scheduler
from tvm.relay import data_dep_optimization as ddo
import tvm.relay.testing
from tvm.contrib import graph_executor

# <a name="gm"></a>Get Models & Export to Onnx

In [3]:
# !pip install efficientnet_pytorch
from efficientnet_pytorch import EfficientNet
import torch
import torch.nn as nn
import torchvision.models as models
import cv2
import os
import numpy as np
import time
# Project root directory. The cells below read/write ONNX models under
# ROOT/ONNX_MODELS and auto-scheduler tuning logs under ROOT/TVM_FILES.
ROOT = "/home/luhsuanwen/project"
def set_efficient_model_ready(model):
    """Switch *model* to its non-memory-efficient swish and return it.

    Calls ``model.set_swish(memory_efficient=False)`` on the given object and
    hands the very same object back, so the call can be used inline.
    """
    swish_options = {"memory_efficient": False}
    model.set_swish(**swish_options)
    return model

# Architectures to prepare: torchvision ResNets plus efficientnet_pytorch
# EfficientNets. Every model gets a fresh `class_num`-way classifier head.
model_name_list = ["resnet50", "resnet101", "efficientnet-b4", "efficientnet-b5", "efficientnet-b6", "efficientnet-b7"]
model_path_list = []
dynamic_model_path_list = []
model_list = []
class_num = 801
for name in model_name_list:
    # The two families differ only in how the backbone is created and in the
    # attribute name that holds the final linear layer.
    if name.startswith("res"):
        model = getattr(models, name)(pretrained=True)
        head_attr = "fc"
    else:
        model = EfficientNet.from_pretrained(name)
        model = set_efficient_model_ready(model)
        head_attr = "_fc"

    num_input_fts = getattr(model, head_attr).in_features
    setattr(model, head_attr, nn.Linear(num_input_fts, class_num))
    head = getattr(model, head_attr)

    # Seed *after* the layer has been created so that w/b depend only on the
    # seed and the head dimensions, then overwrite the layer's own init.
    torch.manual_seed(0)
    w = torch.randn(class_num, num_input_fts)
    b = torch.randn(class_num)
    head.bias.data = b
    head.weight.data = w
    print(name, w[0][:5])

    model_path = f"{ROOT}/ONNX_MODELS/{name}.onnx"
    dynamic_model_path = f"{ROOT}/ONNX_MODELS/{name}_dynamic.onnx"
    model_path_list.append(model_path)
    dynamic_model_path_list.append(dynamic_model_path)
    model_list.append(model)

    # Check each export on its own: previously an existing static file also
    # skipped the dynamic export, so a missing dynamic model was never rebuilt.
    # NOTE(review): pytorch_to_onnx / pytorch_to_onnx_dynamic are not defined in
    # this section; they must already be available in the session.
    if not os.path.exists(model_path):
        pytorch_to_onnx(model.eval(), model_path, opset_version=10)
    if not os.path.exists(dynamic_model_path):
        pytorch_to_onnx_dynamic(model.eval(), dynamic_model_path, opset_version=10)


resnet50 tensor([-1.1258, -1.1524, -0.2506, -0.4339,  0.8487])
resnet101 tensor([-1.1258, -1.1524, -0.2506, -0.4339,  0.8487])
Loaded pretrained weights for efficientnet-b4
efficientnet-b4 tensor([-1.1258, -1.1524, -0.2506, -0.4339,  0.8487])
Loaded pretrained weights for efficientnet-b5
efficientnet-b5 tensor([-1.1258, -1.1524, -0.2506, -0.4339,  0.8487])
Loaded pretrained weights for efficientnet-b6
efficientnet-b6 tensor([-1.1258, -1.1524, -0.2506, -0.4339,  0.8487])
Loaded pretrained weights for efficientnet-b7
efficientnet-b7 tensor([-1.1258, -1.1524, -0.2506, -0.4339,  0.8487])


# <a name="cfg"></a>TVM Config

In [4]:
# Network and compilation settings used by the cells that follow.
class cfg:
    """Static settings for the model that is imported, tuned and timed."""

    # --- model -----------------------------------------------------------
    model_name = "resnet101"
    model_path = f"ONNX_MODELS/{model_name}.onnx"
    input_name = "input.1"
    opset_version = 10

    # --- tensors ---------------------------------------------------------
    batch_size = 1
    input_shape = (3, 224, 224)
    output_shape = (batch_size, 801)
    dtype = "float32"
    layout = "NCHW"
    use_sparse = False

    # --- compilation target ----------------------------------------------
    # A CPU-specific target can be used instead, e.g.
    #     target = tvm.target.Target("llvm -mcpu=core-avx2")
    # If the target machine supports avx512 instructions, replace
    # "llvm -mcpu=core-avx2" with "llvm -mcpu=skylake-avx512".
    target = "llvm"
    dev = tvm.device(str(target), 0)

    # --- tuning records ----------------------------------------------------
    # graph_opt_sch_file = "..."
    # log_file = "..."
    json_file = f"TVM_FILES/{model_name}-{layout}-B{batch_size:d}-{target}-sparse{use_sparse:d}.json"
    # Set the number of threads used for tuning based on the number of
    # physical CPU cores on your machine.
    # num_threads = 1

## Export Target Model

In [5]:
# Export the model described by `cfg` to ONNX unless the file is already there.
if os.path.exists(cfg.model_path):
    print("onnx model already exists")
else:
    # Build the architecture named in cfg (rather than a hard-coded
    # ResNet-101) so the exported file always matches cfg.model_name.
    # The per-family construction mirrors the model loop earlier in the file.
    num_classes = cfg.output_shape[-1]
    if cfg.model_name.startswith("res"):
        model = getattr(models, cfg.model_name)(pretrained=True)
        num_input_fts = model.fc.in_features
        model.fc = nn.Linear(num_input_fts, num_classes)
    else:
        model = set_efficient_model_ready(EfficientNet.from_pretrained(cfg.model_name))
        num_input_fts = model._fc.in_features
        model._fc = nn.Linear(num_input_fts, num_classes)

    # torch.onnx.export does not create missing parent directories.
    export_dir = os.path.dirname(cfg.model_path)
    if export_dir:
        os.makedirs(export_dir, exist_ok=True)

    input_data = torch.randn((cfg.batch_size,) + cfg.input_shape)
    torch.onnx.export(
        model.eval(),
        input_data,
        cfg.model_path,
        opset_version=cfg.opset_version,
        # Name the graph input explicitly so it matches the name used when the
        # model is imported into Relay, instead of relying on the exporter's
        # auto-generated name.
        input_names=[cfg.input_name],
    )

onnx model already exists


# <a name="gmo"></a>Get Module

In [5]:
def get_network_from_onnx(
    model_path, 
    input_name="input.1", 
    input_shape=(1, 3, 224, 224), 
    dtype="float32"
):
    """Load the ONNX file at *model_path* and import it into Relay.

    Parameters
    ----------
    model_path : path of the ``.onnx`` file handed to ``onnx.load``.
    input_name : name of the graph input whose shape is being declared.
    input_shape : shape declared for that input.
    dtype : dtype passed on to ``relay.frontend.from_onnx``.

    Returns
    -------
    The ``(mod, params)`` pair produced by ``relay.frontend.from_onnx``.
    """
    relay_mod, relay_params = relay.frontend.from_onnx(
        onnx.load(model_path),
        shape={input_name: input_shape},
        dtype=dtype,
    )
    return relay_mod, relay_params
    
def convert_layout(mod):
    """Return *mod* with its ``nn.conv2d`` ops converted to NHWC.

    Runs ``RemoveUnusedFunctions`` followed by ``ConvertLayout`` (conv2d data
    layout ``NHWC``, kernel layout ``default``) under ``opt_level=3``.
    """
    passes = [
        relay.transform.RemoveUnusedFunctions(),
        relay.transform.ConvertLayout({'nn.conv2d': ['NHWC', 'default']}),
    ]
    pipeline = tvm.transform.Sequential(passes)
    with tvm.transform.PassContext(opt_level=3):
        return pipeline(mod)

def get_tvm_module_N_params(
    model_path, 
    input_name="input.1",
    batch_size=1,
    input_shape=(3, 224, 224),
    layout="NHWC", 
    dtype="float32", 
    use_sparse=True
):
    """Import the ONNX model at *model_path* and prepare it for tuning.

    The model is imported with input shape ``(batch_size,) + input_shape``.
    If *layout* is ``"NHWC"`` the module is passed through ``convert_layout``;
    any other value leaves the imported layout untouched. If *use_sparse* is
    true, the module and parameters are passed through
    ``convert_model_dense_to_sparse`` with ``bs_r=4`` and
    ``random_params=True``.

    Returns the resulting ``(mod, params)`` pair.
    """
    full_shape = (batch_size,) + input_shape
    mod, params = get_network_from_onnx(model_path, input_name, full_shape, dtype)

    # auto-scheduler prefers NHWC layout
    if layout == "NHWC":
        mod = convert_layout(mod)

    # No softmax is appended to the network body; the module is used as imported.

    if not use_sparse:
        return mod, params

    from tvm.topi.sparse.utils import convert_model_dense_to_sparse

    sparse_mod, sparse_params = convert_model_dense_to_sparse(
        mod, params, bs_r=4, random_params=True
    )
    return sparse_mod, sparse_params

In [None]:
# Import the configured model into Relay, then list its tuning tasks.
print("Get module...")
mod, params = get_tvm_module_N_params(
    model_path=cfg.model_path,
    input_name=cfg.input_name,
    batch_size=cfg.batch_size,
    input_shape=cfg.input_shape,
    layout=cfg.layout,
    dtype=cfg.dtype,
    use_sparse=cfg.use_sparse,
)

print("Extract tasks...")
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, cfg.target)

# Show every extracted task together with its compute DAG.
for idx, task in enumerate(tasks):
    print(f"========== Task {idx:d}  (workload key: {task.workload_key}) ==========")
    print(task.compute_dag)

# <a name="as"></a>Auto Scheduling

In [6]:
def run_tuning(tasks, task_weights, json_file, trials=1000, use_sparse=False):
    """Tune *tasks* with the auto-scheduler and record the results.

    Parameters
    ----------
    tasks, task_weights : passed straight to ``auto_scheduler.TaskScheduler``.
    json_file : file the measurement records are written to via ``RecordToFile``.
    trials : total number of measurement trials (``num_measure_trials``).
    use_sparse : when true, every task gets its own ``SketchPolicy`` that uses
        an ``XGBModel`` cost model and the sparse sketch rules.
    """
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)

    # Each candidate is measured 10 times, flushing the CPU cache in between.
    local_runner = auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=trials,  # change this to 20000 to achieve the best performance
        runner=local_runner,
        measure_callbacks=[auto_scheduler.RecordToFile(json_file)],
    )

    if not use_sparse:
        tuner.tune(tune_option)
        return

    from tvm.topi.sparse.utils import sparse_sketch_rules

    search_policy = []
    for task in tasks:
        policy = auto_scheduler.SketchPolicy(
            task,
            program_cost_model=auto_scheduler.XGBModel(),
            init_search_callbacks=sparse_sketch_rules(),
        )
        search_policy.append(policy)

    tuner.tune(tune_option, search_policy=search_policy)


# We do not run the tuning in our webpage server since it takes too long.
# Uncomment the following line to run it by yourself.

# print(cfg.json_file)
# run_tuning(tasks, task_weights, cfg.json_file, 500)

# <a name="ms"></a>Multiple Scheduling

In [11]:
# Tuning jobs to run, one per entry of model_name_list, each given as
# [model_name, layout, use_sparse].
import os
schedule_list = [[model_name, "NCHW", False] for model_name in model_name_list]

def multiple_scheduling(model_name, layout, use_sparse, trials=2000):
    """Tune one exported model unless its tuning log already exists.

    Parameters
    ----------
    model_name : base name of the model; its ONNX file is expected at
        ``ROOT/ONNX_MODELS/<model_name>.onnx``.
    layout : layout handed to ``get_tvm_module_N_params``; also part of the
        log file name.
    use_sparse : whether to convert the model to sparse and tune it with the
        sparse search policy; also part of the log file name.
    trials : number of measurement trials forwarded to ``run_tuning``.

    The job is skipped entirely when the log file is already present, so an
    interrupted run is not resumed automatically.
    """
    target = "llvm"
    batch_size = 1
    model_path = f"{ROOT}/ONNX_MODELS/{model_name}.onnx"
    json_file = "%s/TVM_FILES/%s-%s-B%d-%s-sparse%d.json" % (
        ROOT, model_name, layout, batch_size, target, use_sparse
    )
    if os.path.exists(json_file):
        return

    # Make sure the directory for the tuning records exists before tuning starts.
    os.makedirs(os.path.dirname(json_file), exist_ok=True)

    mod, params = get_tvm_module_N_params(
        model_path, batch_size=batch_size, layout=layout, use_sparse=use_sparse
    )
    tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

    for idx, task in enumerate(tasks):
        print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
        print(task.compute_dag)

    # Forward use_sparse: without it a sparse-converted model would be tuned
    # with the default (dense) search policy.
    run_tuning(tasks, task_weights, json_file, trials, use_sparse=use_sparse)

In [12]:
# Show the queued jobs, then tune them one after another (2000 trials each).
print(schedule_list)
for m, l, s in schedule_list:
    multiple_scheduling(m, l, s, trials=2000)

[['resnet50', 'NCHW', False], ['resnet101', 'NCHW', False], ['efficientnet-b4', 'NCHW', False], ['efficientnet-b5', 'NCHW', False], ['efficientnet-b6', 'NCHW', False], ['efficientnet-b7', 'NCHW', False]]
placeholder = PLACEHOLDER [1, 1, 224, 224, 3]
data_pad(i0, i1, i2, i3, i4) = tir.if_then_else(((((i2 >= 3) && (i2 < 227)) && (i3 >= 3)) && (i3 < 227)), placeholder[i0, i1, (i2 - 3), (i3 - 3), i4], 0f)
placeholder = PLACEHOLDER [2, 1, 7, 7, 3, 32]
conv2d_NCHWc(n, oc_chunk, oh, ow, oc_block) += (data_pad[n, floordiv(ic, 3), ((oh*2) + kh), ((ow*2) + kw), floormod(ic, 3)]*placeholder[oc_chunk, floordiv(ic, 3), kh, kw, floormod(ic, 3), oc_block])
placeholder = PLACEHOLDER [1, 2, 1, 1, 32]
T_add(ax0, ax1, ax2, ax3, ax4) = (conv2d_NCHWc[ax0, ax1, ax2, ax3, ax4] + placeholder[ax0, ax1, 0, 0, ax4])
T_relu(ax0, ax1, ax2, ax3, ax4) = max(T_add[ax0, ax1, ax2, ax3, ax4], 0f)

placeholder = PLACEHOLDER [1, 2, 112, 112, 32]
pad_temp(ax0, ax1, ax2, ax3, ax4) = tir.if_then_else(((((ax2 >= 1) && (ax2 < 



|  ID  | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
|    0 |        6.767 |          35.12 |     64 |
|    1 |            - |              - |      0 |
|    2 |            - |              - |      0 |
|    3 |            - |              - |      0 |
|    4 |            - |              - |      0 |
|    5 |            - |              - |      0 |
|    6 |            - |              - |      0 |
|    7 |            - |              - |      0 |
|    8 |            - |              - |      0 |
|    9 |            - |              - |      0 |
|   10 |            - |              - |      0 |
|   11 |            - |              - |      0 |
|   12 |            - |              - |      0 |
|   13 |            - |              - |      0 |
|   14 |            - |              - |      0 |
|   15 |            - |              - |      0 |
|   16 |            - |              - |      0 |
|   17 |            - |              - |      0 |


# <a name="in"></a>Evaluate Inference

In [8]:
import cv2
import time
def _calculate_dhdw_half(h, w):
    """Calculate difference of h or w in order to get a square """
    if h > w:
        dh_half = int(0.1*h/2)
        dw_half = int((h+2*dh_half-w)/2)
    else:
        dw_half = int(0.1*w/2)
        dh_half = int((w+2*dw_half-h)/2)
    return dh_half, dw_half

def preprocess_tvm(image):
    """Turn a BGR image array of shape (h, w, c) into a network input batch.

    Steps: convert BGR to RGB, pad to a square with replicated borders, resize
    to 248x248, take the central 224x224 crop, scale by 1/255, move channels
    first, cast to ``cfg.dtype`` and prepend a batch axis of size 1.
    """
    height, width, _ = image.shape
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Add a border
    pad_v, pad_h = _calculate_dhdw_half(height, width)
    padded = cv2.copyMakeBorder(rgb, pad_v, pad_v, pad_h, pad_h, cv2.BORDER_REPLICATE)

    resized = cv2.resize(padded, (248, 248))
    cropped = resized[12:236, 12:236]/255.0
    channels_first = cropped.transpose(2, 0, 1).astype(cfg.dtype)
    return channels_first[np.newaxis, :]

def tvm_inference(module, img):
    """Run *module* once on *img* and return its first output.

    *img* is wrapped with ``tvm.nd.array`` and bound to the input named
    ``cfg.input_name`` before ``module.run()`` is called.
    """
    input_tensor = tvm.nd.array(img)
    module.set_input(cfg.input_name, input_tensor)
    module.run()
    return module.get_output(0)


In [None]:
# Number of end-to-end runs to average over, and the image used for them.
n = 10
image_path = "/home/luhsuanwen/project/sample.jpg"

# Compile with the history best
print("Compile...")
with auto_scheduler.ApplyHistoryBest(cfg.json_file):
    with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
        lib = relay.build(mod, target=cfg.target, params=params)

# Create graph executor
module = graph_executor.GraphModule(lib["default"](cfg.dev))
data_tvm = tvm.nd.array((np.random.uniform(size=(cfg.batch_size, )+cfg.input_shape).astype(cfg.dtype)))
module.set_input(cfg.input_name, data_tvm)

# Evaluate the bare "run" call on random input.
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", cfg.dev, repeat=3, min_repeat_ms=500)
prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))


# End-to-end timing: image read + preprocessing + inference, averaged over n
# runs. perf_counter is a monotonic, high-resolution clock meant for intervals.
start = time.perf_counter()
for i in range(n):
    test_image = cv2.imread(image_path)
    if test_image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"could not read image: {image_path}")
    test_image = preprocess_tvm(test_image)
    tvm_inference(module, test_image)
end = time.perf_counter()
print(cfg.json_file, (end-start)/n)