In [None]:
# Standard-library modules used to locate the ARIES checkout.
import os
import sys
# Working directory of the kernel; later cells build the project path from it.
cur_dir = os.getcwd()
# The ARIES root is taken to be three directories above the working directory.
# The trailing "/" is part of the string; later cells append to it as-is.
aries_path = cur_dir + "/../../../"
# Must run before the star import below so that `frontend` can be found.
sys.path.append(aries_path)
# NOTE(review): the star import presumably supplies every ARIES name used in
# later cells (task_kernel, task_tile, task_top, aries, int32, Schedule) and
# also `np` -- none of them is imported explicitly here; confirm in frontend.
from frontend import *
# Used later to read back the source text of already-executed cells.
from IPython import get_ipython

In [None]:
# MTTKRP: D[i0, j0] += A[i0, k0, l0] * B[k0, j0] * C[l0, j0]
# Full problem extents of the i, j, k and l dimensions.
I, J, K, L = 8, 128, 32, 128
# Extents of one tile along each of those dimensions.
TI, TJ, TK, TL = 2, 16, 8, 16
# Number of tiles per dimension. Floor division is used, so any remainder
# would be dropped silently; with the values above every extent divides
# evenly (4, 8, 4 and 8 tiles).
GRID_I0, GRID_J0, GRID_K0, GRID_L0 = I//TI, J//TJ, K//TK, L//TL # grid must be a tuple

In [None]:
# Single-tile MTTKRP kernel:
#   TileD[i0, j0] = sum over k0, l0 of TileA[i0, k0, l0] * TileB[k0, j0] * TileC[l0, j0]
# Each TileD element is reset to zero before accumulating, so a call overwrites
# the output tile with the partial result of the given operand tiles only.
# NOTE(review): external_path presumably names a hand-written AIE kernel and
# `para` its tile-size parameters -- confirm against task_kernel.
# Plain comments are used instead of a docstring because the source of this
# cell is later handed to sch.build(); a docstring would add a statement to
# the function body.
@task_kernel(external_path="aie1/adf/kernel_mttkrp/aie_int32", para = [TI, TJ, TK, TL])
def kernel_mttkrp(TileA: int32[TI, TK, TL],
                  TileB: int32[TK, TJ],
                  TileC: int32[TL, TJ],
                  TileD: int32[TI, TJ]):
    for i0 in range(0, TI):
        for j0 in range(0, TJ):
            # Start every output element from zero for this tile.
            TileD[i0, j0] = int32(0)
            # Reduce over both contracted dimensions (k and l).
            for k0 in range(0, TK):
                for l0 in range(0, TL):
                    TileD[i0, j0] += TileA[i0, k0, l0] * TileB[k0, j0] * TileC[l0, j0]

In [None]:
# Tiled MTTKRP task: walks the GRID_I x GRID_J x GRID_K x GRID_L tile grid and,
# for every tile, loads the operand tiles, runs kernel_mttkrp on them and
# stores the partial result into D.
# A, B, C and D are declared with -1 extents. The tile sizes TI, TJ, TK and TL
# are read from module scope; only the grid extents are passed as arguments.
@task_tile()
def mttkrp(A: int32[-1, -1, -1], B: int32[-1, -1], C: int32[-1, -1], 
           D: int32[-1, -1], GRID_I, GRID_J, GRID_K, GRID_L):
    for i in range(GRID_I):
        for j in range(GRID_J):
            for k in range(GRID_K):
                for l in range(GRID_L):
                    # Compute tile slices for multiple dimensions
                    ti = aries.arange(i*TI, (i+1)*TI)  # I tile range
                    tj = aries.arange(j*TJ, (j+1)*TJ)  # J tile range
                    tk = aries.arange(k*TK, (k+1)*TK)  # K tile range
                    tl = aries.arange(l*TL, (l+1)*TL)  # L tile range

                    # Tile-sized int32 buffers, one per operand plus the output.
                    L1_A = aries.buffer((TI, TK, TL), "int32")
                    L1_B = aries.buffer((TK, TJ), "int32")
                    L1_C = aries.buffer((TL, TJ), "int32")
                    L1_D = aries.buffer((TI, TJ), "int32")

                    # The operand names are rebound to whatever aries.load returns.
                    # NOTE(review): whether the buffer declarations above are still
                    # required by the lowering cannot be told from this cell.
                    L1_A = aries.load(A, (ti, tk, tl))
                    L1_B = aries.load(B, (tk, tj))
                    L1_C = aries.load(C, (tl, tj))
                    # The kernel zeroes L1_D itself before accumulating into it.
                    kernel_mttkrp(L1_A, L1_B, L1_C, L1_D)
                    # NOTE(review): accstore presumably adds L1_D into D[ti, tj] so the
                    # contributions of all (k, l) tiles sum up -- confirm in aries.
                    aries.accstore(L1_D, D, (ti, tj))

In [None]:
# Top-level task: takes the fixed-shape operands, casts them to the -1 extent
# form that mttkrp() declares, and invokes the tiled task over the whole grid.
@task_top()
def top(A: int32[I, K, L], B: int32[K, J], 
        C: int32[L, J], D: int32[I, J]):
    cast_A = aries.cast(A, (-1, -1, -1)) # This is for lowering
    cast_B = aries.cast(B, (-1, -1))
    cast_C = aries.cast(C, (-1, -1))
    cast_D = aries.cast(D, (-1, -1))
    # The grid extents come from the module-level GRID_*0 constants.
    mttkrp_task = mttkrp(cast_A, cast_B, cast_C, cast_D, 
                         GRID_I0, GRID_J0, GRID_K0, GRID_L0)
    # The result of the call is returned; a later cell passes it to Schedule().
    return mttkrp_task

In [None]:
# Get the input cells that contains the decorators.
# IPython keeps the source of the N-th executed cell in In[N], so this slice
# only picks the right cells (constants, kernel, tile task, top task) when the
# notebook has been run top-to-bottom on a fresh kernel.
cell_codes = get_ipython().user_ns["In"][2:6]
# Join them into one string, with a newline between each cell
all_code = "\n".join(cell_codes)

# Fail early with a clear message if the cells were executed out of order;
# otherwise sch.build() would be handed unrelated source text.
_expected_decorators = ("@task_kernel", "@task_tile", "@task_top")
_captured_lines = [line.lstrip() for line in all_code.splitlines()]
_missing_decorators = [
    deco for deco in _expected_decorators
    if not any(line.startswith(deco) for line in _captured_lines)
]
if _missing_decorators:
    raise RuntimeError(
        "In[2:6] does not contain the expected task definitions (missing: "
        + ", ".join(_missing_decorators)
        + "). Restart the kernel and run all cells in order."
    )

In [None]:
def mttkrp_sw(A: int32[I, K, L], B: int32[K, J], C: int32[L, J]):
    """Software (NumPy) reference for MTTKRP, used as the golden result.

    Computes ``D[i, j] = sum over k, l of A[i, k, l] * B[k, j] * C[l, j]``.

    Args:
        A: array indexed as ``[i, k, l]``.
        B: array indexed as ``[k, j]``.
        C: array indexed as ``[l, j]``.

    Returns:
        A new int32 array indexed as ``[i, j]``. The output shape follows the
        shapes of the arguments; for operands of the annotated shapes this is
        ``(I, J)``.
    """
    # One einsum call performs the same multiply-accumulate as four nested
    # Python loops, but in compiled code instead of millions of scalar updates.
    D = np.einsum("ikl,kj,lj->ij", A, B, C)
    # Keep the int32 result type regardless of the operand dtype.
    return D.astype(np.int32)

In [None]:
# Initialize the buffers (fixed seed so every run uses the same operands)
np.random.seed(0)
A = np.random.randint(-5, 6, size=(I, K, L), dtype=np.int32)
B = np.random.randint(-5, 6, size=(K, J), dtype=np.int32)
C = np.random.randint(-5, 6, size=(L, J), dtype=np.int32)
# Output buffer, allocated directly as int32; it is compared with the golden
# result after top() has run.
D = np.zeros((I, J), dtype=np.int32)

# Execute ARIES on CPU
mttkrp_task = top(A, B, C, D)

# Golden file generation
E = mttkrp_sw(A, B, C)

# Compare the program with golden file.
# The data is integer, so the comparison is exact: np.allclose applies a
# relative tolerance (rtol * |E|) that lets small integer mismatches pass
# when |E| is large.
print(np.array_equal(D, E))

# Generate files for on-board test
aries.gen_sim([A, B, C, E])

In [None]:
# Specify primitives to optimize hardware design.
# The schedule is created from the task object returned by top().
sch = Schedule(mttkrp_task)

############# Primitives #############
# Each knob is a four-entry list.
# NOTE(review): the entries of the first two lists presumably map to the
# i, j, k, l loops of the tiled task -- confirm against Schedule.
aie_parallel = [2, 2, 1, 2]  # AIE Array Parallelism
l2_reuse = [2, 2, 2, 2]      # L2 buffer data reuse
# Select the type of buffer of A, B, C, 1:BRAM; 0:URAM
# NOTE(review): there are four entries; the fourth presumably applies to D.
buf_types = [0, 1, 0, 1]

sch.parallel(mttkrp_task, aie_parallel)
sch.l2buffer(mttkrp_task, l2_reuse)
sch.bufsel(mttkrp_task, buf_types)
######################################

# Select the target board.
sch.to("VCK190")

In [None]:
# Set the project dir and template dir.
# The project is generated under the kernel's working directory; the
# templates directory is looked up under aries_path.
prj_dir = f"{cur_dir}/project_mttkrp_dyn"
temp_dir = f"{aries_path}/templates"
# Generate Initial MLIR and ARIES Opts from the captured cell source
sch.build(all_code, prj_dir, temp_dir)