<a href="https://colab.research.google.com/github/mlc-ai/notebooks/blob/main/2_tensor_program_abstraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Comparing FP32 vs Int8 latency





Use the matrix multiplication program from the Machine Learning Compiler (MLC) course to compare FP32 and Int8 latency, both with the default schedule and with the loops split into blocks of various sizes.

In [41]:
import tvm
from tvm.ir.module import IRModule
from tvm import te
import numpy as np

# Dimensions of the matrices: A is (M, K), B is (K, N) and C is (M, N).
M = 1024
K = 1024
N = 1024

# Update the flag below for the desired HW target. Every build in this cell
# reads this one variable, so changing it here retargets the whole benchmark.
target = "llvm -mcpu=skylake"
dev = tvm.device(target, 0)


def _random_matrix(rows, cols, dtype):
    """Return a (rows, cols) NumPy array of random values with the given dtype.

    Integer dtypes are sampled uniformly over their full representable range.
    Casting ``np.random.rand`` output (values in [0, 1)) to an integer type
    truncates every element to zero, so integers need their own sampler.
    """
    if np.issubdtype(np.dtype(dtype), np.integer):
        limits = np.iinfo(dtype)
        return np.random.randint(
            limits.min, limits.max + 1, size=(rows, cols), dtype=dtype
        )
    return np.random.rand(rows, cols).astype(dtype)


# Iterate over the data types
for dtype in ["float32", "int8"]:
    # Algorithm: C[m, n] = sum over k of A[m, k] * B[k, n].
    # NOTE(review): inputs and output share `dtype`, so for int8 the sums are
    # expected to overflow; only the latency is of interest here.
    k = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), dtype, name="A")
    B = te.placeholder((K, N), dtype, name="B")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    # Default schedule
    prim_func = te.create_prim_func([A, B, C])
    prim_func = prim_func.with_attr("global_symbol", "main")
    ir_module = IRModule({"main": prim_func})

    func = tvm.build(ir_module, target)  # The module for CPU backends.

    a = tvm.nd.array(_random_matrix(M, K, dtype), dev)
    b = tvm.nd.array(_random_matrix(K, N, dtype), dev)
    c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
    # Run once outside the timer before measuring.
    func(a, b, c)

    evaluator = func.time_evaluator(func.entry_name, dev, number=1)
    print("Data Type:", dtype.rjust(7, ' '), "   Baseline latency:  %f" % evaluator(a, b, c).mean)

    # NOTE(review): 2048 is larger than the 1024-wide loops being split; it is
    # kept so the effect of an oversized block stays visible in the results.
    for block_size in [8, 16, 32, 64, 128, 256, 512, 1024, 2048]:
        # Now change the schedule taking into consideration the cache
        sch = tvm.tir.Schedule(ir_module)
        block_c = sch.get_block("C")
        # Get loops surrounding the block. The reduction loop gets its own
        # name so it does not rebind the `k` reduce axis defined above.
        (y, x, k_loop) = sch.get_loops(block_c)
        yo, yi = sch.split(y, [None, block_size])
        xo, xi = sch.split(x, [None, block_size])

        # Outer block loops first, then the reduction, then the inner loops
        # that walk a single block_size x block_size block.
        sch.reorder(yo, xo, k_loop, yi, xi)

        # Build for the same target as the baseline so the numbers compare.
        func = tvm.build(sch.mod, target=target)  # The module for CPU backends.

        # Fresh output buffer for each schedule variant.
        c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
        func(a, b, c)

        evaluator = func.time_evaluator(func.entry_name, dev, number=1)
        print("Splitting tensors into", str(block_size).rjust(4, ' '),
              "word blocks: %f" % evaluator(a, b, c).mean)

    print("\n")

# Show the TIR of the last schedule built above (last dtype, last block size).
print("Reordered schedule:")
sch.mod.show()

Data Type: float32    Baseline latency:  1.507176
Splitting tensors into    8 word blocks: 0.305563
Splitting tensors into   16 word blocks: 0.237635
Splitting tensors into   32 word blocks: 0.199116
Splitting tensors into   64 word blocks: 0.167480
Splitting tensors into  128 word blocks: 0.128696
Splitting tensors into  256 word blocks: 0.143246
Splitting tensors into  512 word blocks: 0.140489
Splitting tensors into 1024 word blocks: 0.139810
Splitting tensors into 2048 word blocks: 1.309801


Data Type:    int8    Baseline latency:  1.686101
Splitting tensors into    8 word blocks: 0.139921
Splitting tensors into   16 word blocks: 0.092403
Splitting tensors into   32 word blocks: 0.042042
Splitting tensors into   64 word blocks: 0.036159
Splitting tensors into  128 word blocks: 0.032969
Splitting tensors into  256 word blocks: 0.042193
Splitting tensors into  512 word blocks: 0.046457
Splitting tensors into 1024 word blocks: 0.036784
Splitting tensors into 2048 word blocks: 2.69339