<a href="https://colab.research.google.com/github/mlc-ai/notebooks/blob/main/2_tensor_program_abstraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Comparing FP32 vs Int8 latency





Use the matrix multiplication program from the Machine Learning Compiler (MLC) course to explore the FP32 vs Int8 optimizations. The goal is to explore how the default schedule can be improved by splitting the two spatial loops of the compute block into blocks and reordering the resulting loops.

In [None]:
import tvm
from tvm.ir.module import IRModule
from tvm import te
import numpy as np

# Dimensions of the matrices: A is (M, K), B is (K, N), C is (M, N)
M = 1024
K = 1024
N = 1024

# Update the flag below for the desired HW target. Every build in this cell
# uses it, so the baseline and the tiled variants are compiled for the same CPU.
target = "llvm -mcpu=skylake"
dev = tvm.device(target, 0)

# Tile sizes to try for the two spatial loops.
# NOTE(review): 2048 is larger than M and N (1024) -- confirm it is intended.
block_sizes = [8, 16, 32, 64, 128, 256, 512, 1024, 2048]

# Iterate over the data types, first create the result dictionaries.
# Each one maps block_size -> mean latency reported by the time evaluator.
result_fp_dict = dict()
result_int_dict = dict()
results_by_dtype = {"float32": result_fp_dict, "int8": result_int_dict}


def _random_input(shape, dtype):
    """Return a random NumPy array with the given shape and dtype.

    Floating-point dtypes are sampled uniformly from [0, 1). Integer dtypes
    are sampled over their full value range instead, because casting values
    from [0, 1) to an integer type truncates every element to zero.
    """
    if np.issubdtype(np.dtype(dtype), np.integer):
        info = np.iinfo(dtype)
        # `high` is exclusive, hence the + 1 to include the maximum value.
        return np.random.randint(info.min, info.max + 1, size=shape, dtype=dtype)
    return np.random.rand(*shape).astype(dtype)


for dtype in ["float32", "int8"]:
    # Algorithm: C[m, n] = sum over k of A[m, k] * B[k, n]
    k = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), dtype, name="A")
    B = te.placeholder((K, N), dtype, name="B")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    # Default schedule
    func = te.create_prim_func([A, B, C])
    func = func.with_attr("global_symbol", "main")
    ir_module = IRModule({"main": func})

    func = tvm.build(ir_module, target)  # The module for CPU backends.

    # Only latency is measured below; the numeric output in `c` is not checked.
    a = tvm.nd.array(_random_input((M, K), dtype), dev)
    b = tvm.nd.array(_random_input((K, N), dtype), dev)
    c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
    func(a, b, c)

    evaluator = func.time_evaluator(func.entry_name, dev, number=1)
    print("Data Type:", dtype.rjust(7, ' '), "   Baseline latency:  %f" % evaluator(a, b, c).mean)

    for block_size in block_sizes:
        # Now change the schedule taking into consideration the cache
        sch = tvm.tir.Schedule(ir_module)
        block_c = sch.get_block("C")
        # Get loops surrounding the block. The reduction loop gets its own
        # name so that it does not shadow the reduce axis `k` defined above.
        (y, x, k_loop) = sch.get_loops(block_c)
        yo, yi = sch.split(y, [None, block_size])
        xo, xi = sch.split(x, [None, block_size])

        sch.reorder(yo, xo, k_loop, yi, xi)

        # Build for the same target as the baseline so the numbers compare.
        func = tvm.build(sch.mod, target=target)  # The module for CPU backends.

        c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
        func(a, b, c)

        evaluator = func.time_evaluator(func.entry_name, dev, number=1)
        result = evaluator(a, b, c).mean
        print("Splitting tensors into", str(block_size).rjust(4, ' '),
              "word blocks: %.4f" % result)

        # Record the latency in the result dictionary for this data type
        results_by_dtype[dtype][block_size] = result

    print("\n")

Now get the best schedules for both FP32 and Int8.

In [None]:
# Report the block size with the lowest measured latency for each data type.
# Each dictionary maps block_size -> latency, so taking min() over the items
# keyed on the latency yields the fastest (block_size, latency) pair.
best = min(result_fp_dict.items(), key=lambda item: item[1])
print("Best FP32 is block_size", str(best[0]).rjust(4, ' '),
      "with latency %.4f" % best[1])
best = min(result_int_dict.items(), key=lambda item: item[1])
print("Best Int8 is block_size", str(best[0]).rjust(4, ' '),
      "with latency %.4f" % best[1])
# Keep the best Int8 block size in `block_size` for later use.
block_size = int(best[0])

Let's see what the optimization did for the best schedule compared to the baseline.

In [None]:
# Print the module held in `ir_module` before any schedule transformation.
print("Original Int8 schedule")
ir_module.show()

# Rebuild the tiled schedule using the current `block_size`, then print it.
print("Reordered schedule:")
sch = tvm.tir.Schedule(ir_module)
block_c = sch.get_block("C")
y, x, k = sch.get_loops(block_c)  # loops surrounding block "C"

# Split each of the first two loops into an outer and an inner loop, where
# the inner loop has extent `block_size`.
yo, yi = sch.split(y, [None, block_size])
xo, xi = sch.split(x, [None, block_size])

# Outer loops first, then the `k` loop, then the two inner loops.
sch.reorder(yo, xo, k, yi, xi)

sch.mod.show()