In [122]:
import numpy as np
from abc import ABC, abstractmethod

N = 1   # Batch size
K = 1   # Number of filters (output channels)
C = 3   # Input channels
W = 5   # Output width
H = 5   # Output height
R = 3   # Kernel/filter height
S = 3   # Kernel/filter width

In [123]:
def create_input_tensor(N, C, W, H):
    return np.array(np.random.randint(0, 255, (N, C, W, H), dtype=np.uint8))

def create_kernel_tensor(K, C, R, S):
    # Filter dimensions: (K, C, R, S)
    filter_tensor = np.zeros((K, C, R, S), dtype=np.uint8)
    
    # Fill the filter with identity matrices (1s along the diagonal)
    for k in range(K):
        for c in range(C):
            for i in range(min(R, S)):  # Ensure it's square (R == S)
                filter_tensor[k][c][i][i] = 1
    return np.array((filter_tensor))

In [133]:
# Base class to describe CNN dataflows or loop nests
class data_flow(ABC):
    def __init__(self, computation_unit, accumulator):
        self.computation_unit = computation_unit
        self.accumulator = accumulator

    def compute_parameters(self, ifmap, kernel):
        N, C, W, H = ifmap.shape
        K, _, R, S = kernel.shape
        X = W # Input width
        Y = H # Input height
        # Ignore padding and strides for now
        W = W-S # Output width
        H = H-R # Output height
        return (N, K, C, W, H, R, S, X, Y)

    @abstractmethod
    def execute(self):
        pass

In [134]:
# Baseline CNN representation
class cnn_7d_loop_nest(data_flow):
    def execute(self, ifmap, kernel):
        cycles = 0
        ifmap = np.array(ifmap)
        kernel = np.array(kernel)
        
        N, K, C, W, H, R, S, X, Y = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H))
        for n in range(N): # Loop over batch size
            for k in range(K): # Loop over output channels (filters)
                for c in range(C): # Loop over input channels
                    for w in range(W): # Loop over output width
                        for h in range(H): # Loop over output height
                            for r in range(R): # Loop over kernel width
                                for s in range(S): # Loop over kernel height
                                    output[n][k][w][h] += ifmap[n][c][w+r][h+s] * kernel[k][c][r][s]
                                    cycles += 1

        return (output, cycles)

In [135]:
# Each individual PE represents a unit similar to a Fusion Unit. 
# In other words, it is composed of 16 Bitbricks and performs 8-bit by 8-bit multiplication
class pe_array:
    def __init__(self, width, height):
        self.width = width
        self.height = height

    # fmap and weight can be shaped depending on algorithm for implementing cnn
    def compute_psum(self, fmap, weight):
        psums = []
        for r in range(self.width):
            for s in range(self.height):
                psums.append(fmap[r][s] * weight[r][s])
        return psums

class accumulator_1d:
    def __init__(self):
        pass
        
    def accumulate(self, psums):
        return sum(psums)

# Sliding window approach
# Assume PE array has dimensions equal to kernel
class single_channel_sliding_window(data_flow):
    def execute(self, ifmap, kernel):
        cycles = 0
        ifmap = np.array(ifmap)
        kernel = np.array(kernel)
        
        N, K, C, W, H, R, S, X, Y = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H))

        # Perform the convolution operation (iterating over the 7 nested loops)
        for n in range(N): # Loop over batch size
            for k in range(K): # Loop over output channels (filters)
                for c in range(C): # Loop over input channels
                    for w in range(W): # Loop over output width
                        for h in range(H): # Loop over output height
                            # Extract the sliding window of the feature map
                            # sliding window has dimensions equal to kernel
                            rows = ifmap[n][c][w:w+S]
                            sliding_window = []
                            for row in rows:
                                sliding_window.append(list(row[h:h+R]))
                            psums = self.computation_unit.compute_psum(sliding_window, kernel[k][c])
                            accumulated_results = self.accumulator.accumulate(psums)
                            output[n][k][w][h] += accumulated_results

                            cycles += 1
        return (output, cycles)

In [136]:
class accumulator_2d:
    def __init__(self):
        pass
        
    def accumulate(self, psums):
        final_sum = 0

        for psum in psums:
            final_sum += sum(psum)
        return final_sum

# Multi-channel sliding window approach
# Assume PE Array is the same as number of input channels
# Assume PE array has dimensions equal to kernel 
class multi_channel_sliding_window(data_flow):
    def __init__(self, computation_unit, accumulator):
        self.computation_unit = computation_unit
        self.accumulator = accumulator
        
    def execute(self, ifmap, kernel):
        cycles = 0
        ifmap = np.array(ifmap)
        kernel = np.array(kernel)
        
        N, K, C, W, H, R, S, X, Y = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H))

        # Perform the convolution operation (iterating over the 7 nested loops)
        for n in range(N): # Loop over batch size
            for k in range(K): # Loop over output channels (filters)
                    for w in range(W): # Loop over output width
                        for h in range(H): # Loop over output height
                            pe_input = []
                            # Extract the sliding window of multiple channels the feature map
                            for c in range(C): 
                                rows = ifmap[n][c][w:w+S]
                                sliding_window = []
                                for row in rows:
                                    sliding_window.append(list(row[h:h+R]))
                                pe_input.append(sliding_window)

                            # Spatially unroll input channels or tensor elements
                            # Frankly, this is not necessary since python wont execute this in parallel
                            # but it should visualize the architecture
                            psum_channel_0 = self.computation_unit.compute_psum(pe_input[0], kernel[k][0])
                            psum_channel_1 = self.computation_unit.compute_psum(pe_input[1], kernel[k][1])
                            psum_channel_2 = self.computation_unit.compute_psum(pe_input[2], kernel[k][2])
                            accumulated_results = self.accumulator.accumulate([psum_channel_0, psum_channel_1, psum_channel_2])
                            output[n][k][w][h] = accumulated_results

                            cycles += 1
        return (output, cycles)

In [146]:
# Assume tensor PE elements has same number of input channels
class row_data_reuse(data_router):
    def execute(self, ifmap, kernel):
        cycles = 0
        N_cpe = 1 # num of C-PE arrays
        R = 1 # num of rows
        T = 1 # num of C-PE in array
        Pic = 1 # Parallelism of input channel
        Poc = int(N_cpe/Pic/R)
        ifmap = np.array(ifmap)
        kernel = np.array(kernel)
        
        N, K, C, W, H, R, S, X, Y = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H), dtype=np.uint8)

        # Perform the convolution operation (iterating over the 7 nested loops)
        for n in range(N): # Loop over batch size
            for x in range (0, X, R): # Loop over ifmap rows with stride R
                for r in range(R): # Loop over each individual row
                    for y in range(0, Y, T): # Loop over ifmap columns with stride T
                        for oc in range(0, K, Poc): # loop over output channels with stride Poc
                            for ic in range(0, C, Pic): # loop over input channels with stride Pic
                                for i in range(R): # loop over kernel width
                                    for j in range(S): # loop over kernel height
                                        for ooc in range(oc):
                                            for iic in range(ic):
                                                output[n][ooc][x+R][y] += kernel[ooc][iic][i][j] * ifmap[icc][x+r+i][y+j]
                                                cycles += 1
        return (output, cycles)

In [147]:
input_tensor = create_input_tensor(N, C, W, H)
print("--Input Tensor--")
print(input_tensor)
kernel_tensor = create_kernel_tensor(K, C, R, S)
print("\n--Kernel Tensor--")
print(kernel_tensor)

# Default 7-D CNN Loop nest
cnn = data_router(None, None)
output, cycles = cnn.execute(input_tensor, kernel_tensor)
print("\n--Output Tensor (CNN loop nest)--")
print(output, cycles)

# Single-channel of kernel fully maps to PE array
accu = accumulator_1d()
pe = pe_array(3,3)
scsw = single_channel_sliding_window(pe, accu)
output, cycles = scsw.execute(input_tensor, kernel_tensor)
print("\n--Output Tensor (single-channel sliding window)--")
print(output, cycles)

# Multiple channels of kernel maps to PE array
accu_unit = accumulator_2d()
pe = pe_array(3,3)
mcsw = multi_channel_sliding_window(pe, accu_unit)
output, cycles = mcsw.execute(input_tensor, kernel_tensor)
print("\n--Output Tensor (multi-channel sliding window)--")
print(output, cycles)

# Row data-reuse
# WIP
rdr = row_data_reuse(None, None)
output, cycles = rdr.execute(input_tensor, kernel_tensor)
print("\n--Output Tensor (Row data-reuse)--")
print(output, cycles)

--Input Tensor--
[[[[134 222 173 229 238]
   [120 212 130 247 234]
   [ 17 161 108  12 154]
   [ 99 212 228 206  18]
   [ 10 217   4 178  35]]

  [[ 74 167 117 150 231]
   [243 217 113  19 226]
   [231  26  13  39 124]
   [196 241 153   4  64]
   [183  71 125 122  27]]

  [[102 160  32  88 245]
   [ 30  36 184  15 230]
   [ 95 187 254  47 203]
   [136 207  18  88  89]
   [217 228  30 216 238]]]]

--Kernel Tensor--
[[[[1 0 0]
   [0 1 0]
   [0 0 1]]

  [[1 0 0]
   [0 1 0]
   [0 0 1]]

  [[1 0 0]
   [0 1 0]
   [0 0 1]]]]
3

--Output Tensor (CNN loop nest)--
[[[[1150. 1074.]
   [1166. 1138.]]]] 108

--Output Tensor (single-channel sliding window)--
[[[[1150. 1074.]
   [1166. 1138.]]]] 12

--Output Tensor (multi-channel sliding window)--
[[[[1150. 1074.]
   [1166. 1138.]]]] 4

--Output Tensor (Row data-reuse)--
[[[[0 0]
   [0 0]]]] 0
