<a href="https://colab.research.google.com/github/mlc-ai/notebooks/blob/main/2_tensor_program_abstraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Comparing FP32 vs Int8 latency





Use the matrix multiplication program from the Machine Learning Compiler (MLC) course to explore FP32 vs Int8 optimizations. The goal is to explore how the default schedule can be improved by splitting the loops of the compute block.

Start by breaking up the outer loops in the compute block to increase temporal locality.

### Caching

In [4]:
import tvm
from tvm.ir.module import IRModule
from tvm import te
import numpy as np

# Dimensions of the matrices
M = 1024
K = 1024
N = 1024

# Update the flag below for the desired HW target
target = "llvm -mcpu=skylake"
dev = tvm.device(target, 0)


def _random_matrix(shape, dtype):
    """Return a random host matrix of `shape` with non-trivial values in `dtype`.

    np.random.rand draws from [0, 1), so casting it to an integer dtype
    truncates every element to 0. Integer dtypes therefore draw small
    integers instead, so the int8 benchmark does not run on all-zero inputs.
    """
    if np.issubdtype(np.dtype(dtype), np.integer):
        return np.random.randint(0, 8, size=shape).astype(dtype)
    return np.random.rand(*shape).astype(dtype)


# Latency per block size, one dictionary per data type. The two dictionaries
# keep their own names because they are read again after the loop.
result_fp_dict = dict()
result_int_dict = dict()
results_by_dtype = {"float32": result_fp_dict, "int8": result_int_dict}

for dtype, results in results_by_dtype.items():
    # Algorithm: C[m, n] = sum over k of A[m, k] * B[k, n]
    k = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), dtype, name="A")
    B = te.placeholder((K, N), dtype, name="B")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    # Default schedule
    prim_func = te.create_prim_func([A, B, C]).with_attr("global_symbol", "main")
    ir_module = IRModule({"main": prim_func})

    func = tvm.build(ir_module, target)  # The module for CPU backends.

    a = tvm.nd.array(_random_matrix((M, K), dtype), dev)
    b = tvm.nd.array(_random_matrix((K, N), dtype), dev)
    c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
    func(a, b, c)

    evaluator = func.time_evaluator(func.entry_name, dev, number=1)
    print("Data Type:", dtype.rjust(7, ' '), "   Baseline latency:  %f" % evaluator(a, b, c).mean)

    for block_size in [8, 16, 32, 64, 128, 256, 512, 1024]:
        # Now change the schedule taking into consideration the cache
        sch = tvm.tir.Schedule(ir_module)
        block_c = sch.get_block("C")
        # Break the two outer loops into block_size tiles. The loop handles get
        # their own names so they do not shadow the reduce axis `k` above.
        loop_y, loop_x, loop_k = sch.get_loops(block_c)
        yo, yi = sch.split(loop_y, [None, block_size])
        xo, xi = sch.split(loop_x, [None, block_size])

        # Run the whole reduction for one tile before moving to the next tile
        sch.reorder(yo, xo, loop_k, yi, xi)

        # Build for the configured target so that editing `target` above
        # affects the tiled schedules as well as the baseline.
        func = tvm.build(sch.mod, target)  # The module for CPU backends.

        c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
        func(a, b, c)

        evaluator = func.time_evaluator(func.entry_name, dev, number=1)
        result = evaluator(a, b, c).mean
        print("Splitting tensors into", str(block_size).rjust(4, ' '),
              "word blocks: %.4f" % result)

        # Record the latency in the dictionary for the current data type
        results[block_size] = result

    print("")

# Pick the (block_size, latency) pair with the lowest latency for each type
best = min(result_fp_dict.items(), key=lambda item: item[1])
fp32 = best[1]
print("Best FP32 is block_size", str(best[0]).rjust(4, ' '),
      "with latency %.4f" % fp32)

best = min(result_int_dict.items(), key=lambda item: item[1])
int8 = best[1]
print("Best Int8 is block_size", str(best[0]).rjust(4, ' '),
      "with latency %.4f" % int8)
print("Int8 speedup over FP32 is %f%%" % ((np.divide(fp32, int8) - 1.0) * 100.0))
block_size = int(best[0]) # Used in the block below

Data Type: float32    Baseline latency:  1.508090
Splitting tensors into    8 word blocks: 0.3096
Splitting tensors into   16 word blocks: 0.2408
Splitting tensors into   32 word blocks: 0.1989
Splitting tensors into   64 word blocks: 0.1714
Splitting tensors into  128 word blocks: 0.1301
Splitting tensors into  256 word blocks: 0.1541
Splitting tensors into  512 word blocks: 0.1462
Splitting tensors into 1024 word blocks: 0.1367

Data Type:    int8    Baseline latency:  1.672048
Splitting tensors into    8 word blocks: 0.1434
Splitting tensors into   16 word blocks: 0.0943
Splitting tensors into   32 word blocks: 0.0409
Splitting tensors into   64 word blocks: 0.0363
Splitting tensors into  128 word blocks: 0.0335
Splitting tensors into  256 word blocks: 0.0414
Splitting tensors into  512 word blocks: 0.0431
Splitting tensors into 1024 word blocks: 0.0360

Best FP32 is block_size  128 with latency 0.1301
Best Int8 is block_size  128 with latency 0.0335
Int8 speedup over FP32 is 287.95

Let's visualize what the best block_size optimization looks like in the TIR code. Note that the optimization includes any arithmetic speedup of int8 vs FP32. This experiment does not separate memory and compute performance improvements.

In [None]:
# ir_module and block_size are not defined in this cell; they are expected to
# be left over from a previously executed cell.
print("Original Int8 schedule")
ir_module.show()

print("Reordered schedule:")
sch = tvm.tir.Schedule(ir_module)
# Get the loops surrounding block "C"
loop_y, loop_x, loop_k = sch.get_loops(sch.get_block("C"))
# Tile both outer loops by block_size, then run the reduction loop per tile
y_outer, y_inner = sch.split(loop_y, [None, block_size])
x_outer, x_inner = sch.split(loop_x, [None, block_size])
sch.reorder(y_outer, x_outer, loop_k, y_inner, x_inner)

sch.mod.show()

### Splitting the middle loop only
Let's explore what happens when we only split the middle loop. This may improve cache locality compared to the above scheme.

In [9]:
import tvm
from tvm.ir.module import IRModule
from tvm import te
import numpy as np

# Dimensions of the matrices
M = 1024
K = 1024
N = 1024

# Update the flag below for the desired HW target
target = "llvm -mcpu=skylake"
dev = tvm.device(target, 0)


def _random_matrix(shape, dtype):
    """Return a random host matrix of `shape` with non-trivial values in `dtype`.

    np.random.rand draws from [0, 1), so casting it to an integer dtype
    truncates every element to 0. Integer dtypes therefore draw small
    integers instead, so the int8 benchmark does not run on all-zero inputs.
    """
    if np.issubdtype(np.dtype(dtype), np.integer):
        return np.random.randint(0, 8, size=shape).astype(dtype)
    return np.random.rand(*shape).astype(dtype)


# Latency per block size, one dictionary per data type. The two dictionaries
# keep their own names because they are read again after the loop.
result_fp_dict = dict()
result_int_dict = dict()
results_by_dtype = {"float32": result_fp_dict, "int8": result_int_dict}

for dtype, results in results_by_dtype.items():
    # Algorithm: C[m, n] = sum over k of A[m, k] * B[k, n]
    k = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), dtype, name="A")
    B = te.placeholder((K, N), dtype, name="B")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    # Default schedule
    prim_func = te.create_prim_func([A, B, C]).with_attr("global_symbol", "main")
    ir_module = IRModule({"main": prim_func})

    func = tvm.build(ir_module, target)  # The module for CPU backends.

    a = tvm.nd.array(_random_matrix((M, K), dtype), dev)
    b = tvm.nd.array(_random_matrix((K, N), dtype), dev)
    c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
    func(a, b, c)

    evaluator = func.time_evaluator(func.entry_name, dev, number=1)
    print("Data Type:", dtype.rjust(7, ' '), "   Baseline latency:  %f" % evaluator(a, b, c).mean)

    for block_size in [8, 16, 32, 64, 128, 256, 512, 1024]:
        # Now change the schedule taking into consideration the cache
        sch = tvm.tir.Schedule(ir_module)
        block_c = sch.get_block("C")
        # Split only the middle (x) loop into block_size chunks. The loop
        # handles get their own names so they do not shadow the reduce axis `k`.
        loop_y, loop_x, loop_k = sch.get_loops(block_c)
        xo, xi = sch.split(loop_x, [None, block_size])

        # Move the reduction loop outside the inner x chunk
        sch.reorder(loop_y, xo, loop_k, xi)

        # Build for the configured target so that editing `target` above
        # affects the split schedules as well as the baseline.
        func = tvm.build(sch.mod, target)  # The module for CPU backends.

        c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
        func(a, b, c)

        evaluator = func.time_evaluator(func.entry_name, dev, number=1)
        result = evaluator(a, b, c).mean
        print("Splitting tensors into", str(block_size).rjust(4, ' '),
              "word blocks: %.4f" % result)

        # Record the latency in the dictionary for the current data type
        results[block_size] = result

    print("")

# Pick the (block_size, latency) pair with the lowest latency for each type
best = min(result_fp_dict.items(), key=lambda item: item[1])
fp32 = best[1]
print("Best FP32 is block_size", str(best[0]).rjust(4, ' '),
      "with latency %.4f" % fp32)

best = min(result_int_dict.items(), key=lambda item: item[1])
int8 = best[1]
print("Best Int8 is block_size", str(best[0]).rjust(4, ' '),
      "with latency %.4f" % int8)
print("Int8 speedup over FP32 is %f%%" % ((np.divide(fp32, int8) - 1.0) * 100.0))
block_size = int(best[0]) # Used in the block below

Data Type: float32    Baseline latency:  1.514097
Splitting tensors into    8 word blocks: 0.6477
Splitting tensors into   16 word blocks: 0.4232
Splitting tensors into   32 word blocks: 0.1479
Splitting tensors into   64 word blocks: 0.1386
Splitting tensors into  128 word blocks: 0.1213
Splitting tensors into  256 word blocks: 0.1083
Splitting tensors into  512 word blocks: 0.0951
Splitting tensors into 1024 word blocks: 0.0896

Data Type:    int8    Baseline latency:  1.685904
Splitting tensors into    8 word blocks: 0.2565
Splitting tensors into   16 word blocks: 0.1558
Splitting tensors into   32 word blocks: 0.0912
Splitting tensors into   64 word blocks: 0.0799
Splitting tensors into  128 word blocks: 0.0712
Splitting tensors into  256 word blocks: 0.0561
Splitting tensors into  512 word blocks: 0.0415
Splitting tensors into 1024 word blocks: 0.0377

Best FP32 is block_size 1024 with latency 0.0896
Best Int8 is block_size 1024 with latency 0.0377
Int8 speedup over FP32 is 137.41

Now let's explore the vectorization capabilities only. For this, focus on the innermost loop.
### Vectorization

In [10]:
import tvm
from tvm.ir.module import IRModule
from tvm import te
import numpy as np

# Dimensions of the matrices
M = 1024
K = 1024
N = 1024

# Update the flag below for the desired HW target
target = "llvm -mcpu=skylake"
dev = tvm.device(target, 0)


def _random_matrix(shape, dtype):
    """Return a random host matrix of `shape` with non-trivial values in `dtype`.

    np.random.rand draws from [0, 1), so casting it to an integer dtype
    truncates every element to 0. Integer dtypes therefore draw small
    integers instead, so the int8 benchmark does not run on all-zero inputs.
    """
    if np.issubdtype(np.dtype(dtype), np.integer):
        return np.random.randint(0, 8, size=shape).astype(dtype)
    return np.random.rand(*shape).astype(dtype)


# Latency per block size, one dictionary per data type. The two dictionaries
# keep their own names because they are read again after the loop.
result_fp_dict = dict()
result_int_dict = dict()
results_by_dtype = {"float32": result_fp_dict, "int8": result_int_dict}

for dtype, results in results_by_dtype.items():
    # Algorithm: C[m, n] = sum over k of A[m, k] * B[k, n]
    k = te.reduce_axis((0, K), "k")
    A = te.placeholder((M, K), dtype, name="A")
    B = te.placeholder((K, N), dtype, name="B")
    C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

    # Default schedule
    prim_func = te.create_prim_func([A, B, C]).with_attr("global_symbol", "main")
    ir_module = IRModule({"main": prim_func})

    func = tvm.build(ir_module, target)  # The module for CPU backends.

    a = tvm.nd.array(_random_matrix((M, K), dtype), dev)
    b = tvm.nd.array(_random_matrix((K, N), dtype), dev)
    c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
    func(a, b, c)

    evaluator = func.time_evaluator(func.entry_name, dev, number=1)
    print("Data Type:", dtype.rjust(7, ' '), "   Baseline latency:  %f" % evaluator(a, b, c).mean)

    for block_size in [8, 16, 32, 64, 128, 256]:
        sch = tvm.tir.Schedule(ir_module)
        block_c = sch.get_block("C")
        # Break the inner loop based on the SIMD vector unit length. The loop
        # handles get their own names so they do not shadow the reduce axis `k`.
        loop_y, loop_x, loop_k = sch.get_loops(block_c)
        ko, ki = sch.split(loop_k, [None, block_size])
        # The split already leaves the loops in y, x, ko, ki order, so no
        # reorder is needed.
        # NOTE: no sch.vectorize() is applied here; this schedule only splits
        # the reduction loop.

        # Build for the configured target so that editing `target` above
        # affects the split schedules as well as the baseline.
        func = tvm.build(sch.mod, target)  # The module for CPU backends.

        c = tvm.nd.array(np.zeros((M, N), dtype=dtype), dev)
        func(a, b, c)

        evaluator = func.time_evaluator(func.entry_name, dev, number=1)
        result = evaluator(a, b, c).mean
        print("Splitting tensors into", str(block_size).rjust(4, ' '),
              "word blocks: %.4f" % result)

        # Record the latency in the dictionary for the current data type
        results[block_size] = result

    print("")

# Pick the (block_size, latency) pair with the lowest latency for each type
best = min(result_fp_dict.items(), key=lambda item: item[1])
fp32 = best[1]
print("Best FP32 is block_size", str(best[0]).rjust(4, ' '),
      "with latency %.4f" % fp32)

best = min(result_int_dict.items(), key=lambda item: item[1])
int8 = best[1]
print("Best Int8 is block_size", str(best[0]).rjust(4, ' '),
      "with latency %.4f" % int8)
print("Int8 speedup over FP32 is %f%%" % ((np.divide(fp32, int8) - 1.0) * 100.0))
block_size = int(best[0]) # Used in the block below

Data Type: float32    Baseline latency:  1.507966
Splitting tensors into    8 word blocks: 1.4972
Splitting tensors into   16 word blocks: 1.7023
Splitting tensors into   32 word blocks: 1.6099
Splitting tensors into   64 word blocks: 1.5219
Splitting tensors into  128 word blocks: 1.5205
Splitting tensors into  256 word blocks: 1.5367

Data Type:    int8    Baseline latency:  1.687452
Splitting tensors into    8 word blocks: 1.7013
Splitting tensors into   16 word blocks: 1.7037
Splitting tensors into   32 word blocks: 1.6747
Splitting tensors into   64 word blocks: 1.6870
Splitting tensors into  128 word blocks: 1.7167
Splitting tensors into  256 word blocks: 1.7023

Best FP32 is block_size    8 with latency 1.4972
Best Int8 is block_size   32 with latency 1.6747
Int8 speedup over FP32 is -10.596955%


There is no improvement compared to FP32, and little improvement from changing block sizes in the inner loop. Most likely, any improvements through vectorization are negated by the poor temporal cache locality.