In [29]:
import numpy as np
from abc import ABC, abstractmethod

# Problem dimensions shared by the driver cell below.
# W and H are passed to create_input_tensor, so they size the *input* feature
# map; the output extent is derived from them in data_flow.compute_parameters.
N = 1   # Batch size
K = 1   # Number of filters (output channels)
C = 3   # Input channels
W = 5   # Input width (size of ifmap axis 2)
H = 5  # Input height (size of ifmap axis 3)
R = 3   # Kernel/filter height (size of kernel axis 2)
S = 3   # Kernel/filter width (size of kernel axis 3)

In [22]:
def create_input_tensor(N, C, W, H):
    """Return a random uint8 input feature map of shape (N, C, W, H).

    Values are drawn with np.random.randint(0, 255, ...); the upper bound is
    exclusive, so every element lies in 0..254.
    """
    shape = (N, C, W, H)
    samples = np.random.randint(0, 255, shape, dtype=np.uint8)
    return np.array(samples)

def create_kernel_tensor(K, C, R, S):
    """Return a uint8 kernel of shape (K, C, R, S) with ones on each main diagonal.

    Every (filter, channel) slice is an R x S matrix that is zero everywhere
    except positions (i, i) for i < min(R, S), i.e. an identity matrix when
    the kernel is square.
    """
    weights = np.zeros((K, C, R, S), dtype=np.uint8)

    # Set one diagonal position at a time across all filters and channels.
    for d in range(min(R, S)):
        weights[:, :, d, d] = 1

    return np.array(weights)

In [23]:
# Base class to describe CNN dataflows or loop nests
class data_flow(ABC):
    """Common scaffolding for a convolution dataflow.

    A dataflow owns a computation unit (produces partial sums) and an
    accumulator (reduces partial sums); concrete subclasses implement
    ``execute`` to walk the loop nest.
    """

    def __init__(self, computation_unit, accumulator):
        self.computation_unit = computation_unit
        self.accumulator = accumulator

    def compute_parameters(self, ifmap, kernel):
        """Derive the loop bounds for a stride-1, unpadded convolution.

        Args:
            ifmap: object whose ``shape`` is (N, C, X, Y).
            kernel: object whose ``shape`` is (K, C, R, S).

        Returns:
            Tuple ``(N, K, C, W, H, R, S, X, Y)`` where X/Y are the input
            extents along axes 2/3 and W/H the corresponding output extents.
        """
        N, C, X, Y = ifmap.shape
        K, _, R, S = kernel.shape
        # Ignore padding and strides for now.
        # Each spatial axis shrinks by the kernel extent along the SAME axis:
        # kernel axis 2 (R) slides over ifmap axis 2, kernel axis 3 (S) over
        # ifmap axis 3.  Pairing them the other way round only works for
        # square kernels and indexes out of bounds otherwise.
        W = X - R + 1  # Output extent along axis 2
        H = Y - S + 1  # Output extent along axis 3
        return (N, K, C, W, H, R, S, X, Y)

    @abstractmethod
    def execute(self):
        """Run the dataflow; the subclasses in this file take (ifmap, kernel)."""
        pass

In [4]:
# Baseline CNN representation
class cnn_7d_loop_nest(data_flow):
    """Reference convolution written as the plain seven-deep loop nest."""

    def execute(self, ifmap, kernel):
        """Convolve ``ifmap`` with ``kernel`` one multiply-accumulate at a time.

        Args:
            ifmap: array-like of shape (N, C, X, Y).
            kernel: array-like of shape (K, C, R, S).

        Returns:
            Tuple ``(output, cycles)``: a float array of shape (N, K, W, H)
            and the number of multiply-accumulate steps performed.
        """
        cycles = 0
        # Widen to the output dtype before multiplying: with uint8 inputs the
        # product of two elements would otherwise be computed in uint8 and
        # wrap modulo 256 before it ever reaches the float accumulator.
        ifmap = np.array(ifmap, dtype=np.float64)
        kernel = np.array(kernel, dtype=np.float64)

        N, K, C, W, H, R, S, X, Y = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H))
        for n in range(N):  # Loop over batch size
            for k in range(K):  # Loop over output channels (filters)
                for c in range(C):  # Loop over input channels
                    for w in range(W):  # Loop over output axis 2
                        for h in range(H):  # Loop over output axis 3
                            for r in range(R):  # Loop over kernel axis 2
                                for s in range(S):  # Loop over kernel axis 3
                                    output[n, k, w, h] += ifmap[n, c, w + r, h + s] * kernel[k, c, r, s]
                                    cycles += 1

        return (output, cycles)

In [27]:
# Each individual PE represents a unit similar to a Fusion Unit.
# In other words, it is composed of 16 Bitbricks and performs 8-bit by 8-bit multiplication
class pe_array:
    """A width x height grid of processing elements, one multiply per PE."""

    def __init__(self, width, height):
        self.width = width
        self.height = height

    @staticmethod
    def _as_python_scalar(value):
        """Unwrap a NumPy scalar to the equivalent built-in Python number."""
        return value.item() if isinstance(value, np.generic) else value

    # fmap and weight can be shaped depending on algorithm for implementing cnn
    def compute_psum(self, fmap, weight):
        """Multiply ``fmap[r][s]`` by ``weight[r][s]`` for every PE.

        Args:
            fmap: 2-D indexable with at least ``width`` rows and ``height``
                columns.
            weight: 2-D indexable with the same minimum extents.

        Returns:
            Flat list of ``width * height`` products in row-major order.
        """
        scalar = self._as_python_scalar
        # Operands are unwrapped first so an 8-bit x 8-bit product keeps its
        # full value instead of wrapping inside a uint8 NumPy scalar.
        return [
            scalar(fmap[r][s]) * scalar(weight[r][s])
            for r in range(self.width)
            for s in range(self.height)
        ]

class accumulator_1d:
    """Reduces a flat sequence of partial sums to a single value."""

    def __init__(self):
        pass

    def accumulate(self, psums):
        """Return the total of ``psums`` (0 for an empty sequence).

        NumPy scalars are unwrapped to built-in numbers before adding: summing
        fixed-width scalars such as uint8 directly can stay in that narrow
        type and wrap around instead of producing the true total.
        """
        return sum(
            psum.item() if isinstance(psum, np.generic) else psum
            for psum in psums
        )

# Sliding window approach
# Assume PE array has dimensions equal to kernel
class single_channel_sliding_window(data_flow):
    """Convolution where each step feeds one kernel-sized window to the PE array.

    One (filter, input channel, output position) triple is handled per step:
    the window is handed to ``computation_unit.compute_psum`` together with
    the matching kernel slice, and the accumulator's result is added to the
    output element.

    NOTE(review): the window is cut as S rows by R columns while
    ``kernel[k][c]`` has shape (R, S); the two agree only for square
    kernels -- confirm the intended orientation.
    """

    def execute(self, ifmap, kernel):
        """Return ``(output, cycles)`` for the given input and kernel tensors.

        ``output`` is a float array of shape (N, K, W, H); ``cycles`` counts
        the number of windows dispatched to the PE array.  Each window is
        also printed to stdout.
        """
        ifmap = np.array(ifmap)
        kernel = np.array(kernel)

        N, K, C, W, H, R, S, X, Y = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H))
        cycles = 0

        for n in range(N):  # batch
            for k in range(K):  # output channels (filters)
                for c in range(C):  # input channels
                    channel = ifmap[n][c]
                    for w in range(W):  # output width
                        for h in range(H):  # output height
                            # Window with the kernel's footprint, as nested lists
                            sliding_window = [
                                list(row[h:h+R]) for row in channel[w:w+S]
                            ]

                            print(sliding_window)
                            psums = self.computation_unit.compute_psum(
                                sliding_window, kernel[k][c]
                            )
                            output[n][k][w][h] += self.accumulator.accumulate(psums)

                            cycles += 1
        return (output, cycles)

In [25]:
class accumulator_2d:
    """Reduces a sequence of partial-sum lists (one per channel) to one value."""

    def __init__(self):
        pass

    def accumulate(self, psums):
        """Return the grand total over every element of every list in ``psums``.

        NumPy scalars are unwrapped to built-in numbers before adding: summing
        fixed-width scalars such as uint8 directly can stay in that narrow
        type and wrap around instead of producing the true total.
        """
        return sum(
            value.item() if isinstance(value, np.generic) else value
            for psum in psums
            for value in psum
        )

# Multi-channel sliding window approach
# Assume PE Array is the same as number of input channels
# Assume PE array has dimensions equal to kernel
class multi_channel_sliding_window(data_flow):
    """Sliding-window convolution that handles all input channels per step.

    For every (filter, output position) the windows of all C input channels
    are gathered, one partial-sum list is computed per channel, and the
    accumulator reduces the whole set into a single output element.  The
    constructor is inherited from ``data_flow``.

    NOTE(review): each window is cut as S rows by R columns while
    ``kernel[k][c]`` has shape (R, S); the two agree only for square
    kernels -- confirm the intended orientation.
    """

    def execute(self, ifmap, kernel):
        """Return ``(output, cycles)`` for the given input and kernel tensors.

        ``output`` is a float array of shape (N, K, W, H); ``cycles`` counts
        output positions, since all channels of one position share a step.
        """
        cycles = 0
        ifmap = np.array(ifmap)
        kernel = np.array(kernel)

        N, K, C, W, H, R, S, X, Y = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H))

        for n in range(N):  # Loop over batch size
            for k in range(K):  # Loop over output channels (filters)
                for w in range(W):  # Loop over output width
                    for h in range(H):  # Loop over output height
                        # Spatially unroll the input channels: conceptually one
                        # PE array per channel working in parallel.  Looping
                        # over C (rather than naming channels 0..2) keeps this
                        # correct for any channel count.
                        channel_psums = []
                        for c in range(C):
                            window = [
                                list(row[h:h+R]) for row in ifmap[n][c][w:w+S]
                            ]
                            channel_psums.append(
                                self.computation_unit.compute_psum(window, kernel[k][c])
                            )

                        output[n][k][w][h] = self.accumulator.accumulate(channel_psums)

                        cycles += 1
        return (output, cycles)

In [7]:
import math
# Assume tensor PE elements has same number of input channels
class row_data_reuse(data_flow):
    """Exploratory row-reuse loop nest.

    As written, execute() only prints the input elements it visits: `output`
    is never written and `cycles` is never incremented, so the return value is
    always an all-zero uint8 array of shape (N, K, W, H) and 0.
    """
    def execute(self, ifmap, kernel):
        cycles = 0
        N_cpe = 1 # num of C-PE arrays
        R = 2 # num of rows (only feeds Poc below; overwritten by compute_parameters)
        T = 2 # num of C-PE in array
        Pic = 1 # Parallelism of input channel
        # Poc is derived from the local R = 2 above, before R is reassigned.
        Poc = math.ceil(N_cpe/Pic/R)
        ifmap = np.array(ifmap)
        kernel = np.array(kernel)
        
        # NOTE(review): this rebinds R to the kernel's axis-2 extent, so every
        # use of R below refers to the kernel size, not "num of rows" -- confirm.
        N, K, C, W, H, R, S, X, Y = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H), dtype=np.uint8)

        # Walk the input feature map and print the visited elements
        # (no multiply-accumulate is performed yet)
        for n in range(N): # Loop over batch size
            for x in range (0, X, R): # Loop over ifmap rows with stride R
                for r in range(R): # Loop over each individual row
                    for y in range(0, Y, T): # Loop over column of input feature map with stride T
                        for oc in range(0, K, Poc): # Loop over output channels with stride Poc
                            for ic in range(0, C, Pic): # Loop over input channels with stride Pic
                                for i in range(R):
                                    # Stop once the row index leaves the feature map
                                    if x+r >= X:
                                        break
                                    # NOTE(review): the column index is bounded by R, not Y;
                                    # possibly meant as a bounds check against Y -- confirm.
                                    if y + i >= R:
                                        break
                                    print(ifmap[n][ic][x+r][y+i])
                                print("--------")
                            print("-C-")
        return (output, cycles)

In [30]:
# Build a random input feature map and a diagonal kernel from the notebook-level
# dimensions, and show both.
input_tensor = create_input_tensor(N, C, W, H)
print("--Input Tensor--")
print(input_tensor)
kernel_tensor = create_kernel_tensor(K, C, R, S)
print("\n--Kernel Tensor--")
print(kernel_tensor)


# Single-channel of kernel fully maps to PE array
accu = accumulator_1d()
# Size the PE array from the kernel dimensions instead of a literal 3x3 so it
# keeps matching the kernel when R or S is changed in the configuration cell.
pe = pe_array(R, S)
scsw = single_channel_sliding_window(pe, accu)
output, cycles = scsw.execute(input_tensor, kernel_tensor)
print("\n--Output Tensor (single-channel sliding window)--")
#print(output, cycles)



--Input Tensor--
[[[[247 172 189  23 165]
   [ 18 114 116 173 151]
   [  6  17 152  58  32]
   [ 30 143 157 247 132]
   [233 181 167 203  18]]

  [[ 44 214 252  24 211]
   [215 120   5 173 152]
   [226 147  42   3 132]
   [220 238 237  74 166]
   [202  65  33  95 248]]

  [[ 45 131  94 203  31]
   [  5 180 253 235 233]
   [ 52 230 233 172   5]
   [ 76 115   3   7  44]
   [ 98 229  25 223  57]]]]

--Kernel Tensor--
[[[[1 0 0]
   [0 1 0]
   [0 0 1]]

  [[1 0 0]
   [0 1 0]
   [0 0 1]]

  [[1 0 0]
   [0 1 0]
   [0 0 1]]]]
[[247, 172, 189], [18, 114, 116], [6, 17, 152]]
[[172, 189, 23], [114, 116, 173], [17, 152, 58]]
[[189, 23, 165], [116, 173, 151], [152, 58, 32]]
[[18, 114, 116], [6, 17, 152], [30, 143, 157]]
[[114, 116, 173], [17, 152, 58], [143, 157, 247]]
[[116, 173, 151], [152, 58, 32], [157, 247, 132]]
[[6, 17, 152], [30, 143, 157], [233, 181, 167]]
[[17, 152, 58], [143, 157, 247], [181, 167, 203]]
[[152, 58, 32], [157, 247, 132], [167, 203, 18]]
[[44, 214, 252], [215, 120, 5], [226