In [21]:
import numpy as np
from abc import ABC, abstractmethod

N = 1   # Batch size
K = 1   # Number of filters (output channels)
C = 3   # Input channels
W = 5   # Output width
H = 5   # Output height
R = 3   # Kernel/filter height
S = 3   # Kernel/filter width

In [22]:
def create_input_tensor(N, C, W, H):
    return np.array(np.random.randint(0, 255, (N, C, W, H), dtype=np.uint8))

def create_kernel_tensor(K, C, R, S):
    # Filter dimensions: (K, C, R, S)
    filter_tensor = np.zeros((K, C, R, S), dtype=np.uint8)
    
    # Fill the filter with identity matrices (1s along the diagonal)
    for k in range(K):
        for c in range(C):
            for i in range(min(R, S)):  # Ensure it's square (R == S)
                filter_tensor[k][c][i][i] = 1
    return np.array((filter_tensor))

In [26]:
class data_router(ABC):
    def __init__(self, computation_unit, accumulator):
        self.computation_unit = computation_unit
        self.accumulator = accumulator

    def compute_parameters(self, ifmap, kernel):
        N, C, W, H = ifmap.shape
        K, _, R, S = kernel.shape

        # Ignore padding and strides for now
        W = W-S+1 # Output width
        H = H-R+1 # Output height
        return (N, K, C, W, H, R, S)

    @abstractmethod
    def execute(self):
        pass

# Each individual PE represents a unit similar to a Fusion Unit. 
# In other words, it is composed of 16 Bitbricks and performs 8-bit by 8-bit multiplication
class pe_array:
    def __init__(self, width, height):
        self.width = width
        self.height = height

    # fmap and weight can be shaped depending on algorithm for implementing cnn
    def compute_psum(self, fmap, weight):
        psums = []
        for r in range(self.width):
            for s in range(self.height):
                psums.append(fmap[r][s] * weight[r][s])
        return psums

# Accumulator is dependent on algorithm and shape of pe_array
class accumulator_1d:
    def __init__(self):
        pass
        
    def accumulate(self, psums):
        return sum(psums)

class sliding_window_single_channel(data_router):
    def execute(self, ifmap, kernel):
        cycles = 0
        ifmap = np.array(ifmap)
        kernel = np.array(kernel)
        
        N, K, C, W, H, R, S = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H), dtype=np.uint8)

        # Perform the convolution operation (iterating over the 7 nested loops)
        for n in range(N): # Loop over batch size
            for k in range(K): # Loop over output channels (filters)
                for c in range(C): # Loop over input channels
                    for w in range(W): # Loop over output width
                        for h in range(H): # Loop over output height
                            # Extract the sliding window of the feature map
                            rows = ifmap[n][c][w:w+S]
                            sliding_window = []
                            for row in rows:
                                sliding_window.append(list(row[h:h+R]))
        
                            psums = self.computation_unit.compute_psum(sliding_window, kernel[k][c])
                            accumulated_results = self.accumulator.accumulate(psums)
                            output[n][k][w][h] = accumulated_results

                            cycles += 1
        return (output, cycles)

In [70]:
class accumulator_2d:
    def __init__(self):
        pass
        
    def accumulate(self, psums):
        final_sum = 0

        for psum in psums:
            final_sum += sum(psum)
        return final_sum
        
# Assume tensor elements has same number of input channels
class sliding_window_multi_channel(data_router):
    def execute(self, ifmap, kernel):
        cycles = 0
        ifmap = np.array(ifmap)
        kernel = np.array(kernel)
        
        N, K, C, W, H, R, S = self.compute_parameters(ifmap, kernel)

        # Output dimensions: (N, K, W, H)
        output = np.zeros((N, K, W, H), dtype=np.uint8)

        # Perform the convolution operation (iterating over the 7 nested loops)
        for n in range(N): # Loop over batch size
            for k in range(K): # Loop over output channels (filters)
                    for w in range(W): # Loop over output width
                        for h in range(H): # Loop over output height
                            pe_input = []
                            # Extract the sliding window of multiple channels the feature map
                            for c in range(C): 
                                rows = ifmap[n][c][w:w+S]
                                sliding_window = []
                                for row in rows:
                                    sliding_window.append(list(row[h:h+R]))
                                pe_input.append(sliding_window)

                            # Spatially unroll input channels or tensor elements
                            # Frankly, this is not necessary since python wont execute this in parallel
                            # but it should visualize the architecture
                            psum_channel_0 = self.computation_unit.compute_psum(pe_input[0], kernel[k][0])
                            psum_channel_1 = self.computation_unit.compute_psum(pe_input[1], kernel[k][1])
                            psum_channel_2 = self.computation_unit.compute_psum(pe_input[2], kernel[k][2])
                            accumulated_results = self.accumulator.accumulate([psum_channel_0, psum_channel_1, psum_channel_2])
                            output[n][k][w][h] = accumulated_results

                            cycles += 1
        return (output, cycles)

In [71]:
input_tensor = create_input_tensor(N, C, W, H)
#print(input_tensor)
kernel_tensor = create_kernel_tensor(K, C, R, S)
#print(kernel_tensor)

# Single-channel of kernel fully maps to PE array
#pe = pe_array(3,3)
#accu = accumulator()
#sliding_window = sliding_window_single_channel(pe, accu)
#output = sliding_window.execute(input_tensor, kernel_tensor)
#print(output)

# Multiple channels of kernel maps to PE array
accu_unit = accumulator_2d()
pe = pe_array(3,3)
sliding_window_multi_channel = sliding_window_multi_channel(pe, accu_unit)
output, cycles = sliding_window_multi_channel.execute(input_tensor, kernel_tensor)
print(output, cycles)

[[[[ 36  48 231]
   [ 52  65 148]
   [105 145 143]]]] 9
