# LAMP FPGA
This notebook runs LAMP model inference on the Ultra96-V2 board.

In [None]:
from pynq_dpu import DpuOverlay
import numpy as np
import random
import cv2

from MPTimeSeriesGenerator import MPTimeseriesGenerator
import scipy.io as sio
import numpy as np
from pynq import Clocks
import threading
import time

## Clock Frequency
We set the PL clock frequency to 100MHz and PS clock frequency to 1.2GHz

In [None]:
# Requested frequencies in MHz: the CPU clock, then the four fabric clocks
# (fclk0..fclk3), which all get the same value.
Clocks.cpu_mhz = 1200.0
for fclk_index in range(4):
    setattr(Clocks, f'fclk{fclk_index}_mhz', 100.0)

# Read every clock attribute back and print it with six decimals.
print(f'CPU:   {Clocks.cpu_mhz:.6f}MHz')
for fclk_index in range(4):
    fclk_mhz = getattr(Clocks, f'fclk{fclk_index}_mhz')
    print(f'FCLK{fclk_index}: {fclk_mhz:.6f}MHz')


Prepare the DPU overlay, load the compiled LAMP model, and create the DPU kernel.

In [None]:
# Instantiate the DPU overlay from `dpu.bit` and load the compiled LAMP model ELF into it.
overlay = DpuOverlay("dpu.bit")
overlay.load_model("dpu_lamp_0.elf")

# NOTE(review): `n2cube` is not imported in any cell visible in this notebook —
# confirm where it comes from (presumably the DNNDK Python bindings) and import it
# in the imports cell.
# Open the DPU and load the "lamp_0" kernel; the handle is kept in `kernel`.
n2cube.dpuOpen()
kernel = n2cube.dpuLoadKernel("lamp_0")

## Prepare input data
Prepare the time series input data using the `MPTimeseriesGenerator` class. This class takes a sequence of data points gathered at equal intervals, together with parameters such as window size, stride, and sample rate, and generates batches of temporal data used as model input.

In [None]:
# --- Generator configuration -------------------------------------------------
matrix_profile_window = 256
sample_rate = 20
lookbehind_seconds = 0
lookahead_seconds = 0
subsequence_stride = 256
lookbehind = sample_rate * lookbehind_seconds
num_outputs = 256
lookahead = sample_rate * lookahead_seconds
forward_sequences = lookahead + num_outputs
subsequences_per_input = lookbehind + num_outputs + lookahead
channel_stride = 8
n_input_series = 1
# Only every `channel_stride`-th subsequence is counted per input.
subsequences_per_input = subsequences_per_input // channel_stride
high_weight = 1
low_thresh = -1
high_thresh = 1
batch_size = 128

# --- Validation data -----------------------------------------------------------
all_data = sio.loadmat('insect_no_classification.mat')

mp_val = np.array(all_data['mp_val'])
ts_val = np.array(all_data['ts_val'])

# `batch_size` must be passed by keyword: a positional argument after keyword
# arguments is a SyntaxError. The named constants above replace the duplicated
# literals (values are unchanged).
# NOTE(review): `length=256` is kept as a literal because it is unclear which of
# the constants above it is meant to track — confirm.
valid_gen = MPTimeseriesGenerator(
    ts_val,
    mp_val,
    num_input_timeseries=n_input_series,
    internal_stride=channel_stride,
    num_outputs=num_outputs,
    lookahead=forward_sequences,
    lookbehind=lookbehind,
    important_upper_threshold=high_thresh,
    important_lower_threshold=low_thresh,
    important_weight=high_weight,
    length=256,
    mp_window=matrix_profile_window,
    stride=num_outputs,
    batch_size=batch_size,
)

Having a batch normalization layer before the activation layer reduces the accuracy of the compiled model, because the tool cannot merge these layers. This layer has therefore been removed from the compiled model; the normalized data is computed on the PS and then fed into the model.

In [None]:
# Batch normalization computed on the host.
# NOTE(review): `data` is not assigned at module level in any cell shown before
# this one — confirm which array is meant to be normalized here.
epsilon = 1e-3

# The four-way unpack fixes `data` as a 4-D array; statistics are taken per
# index of axis 1 (named C here), reducing over the other three axes.
N, C, H, W = data.shape
reduce_axes = (0, 2, 3)

# Per-channel mean over the mini-batch.
mean = data.mean(axis=reduce_axes)

# Deviation from the per-channel mean, reused for the variance and the output.
centered = data - mean.reshape((1, C, 1, 1))

# Per-channel (biased) variance over the mini-batch.
variance = np.mean(centered ** 2, axis=reduce_axes)

# Normalized data; epsilon keeps the denominator away from zero.
X_hat = centered * 1.0 / np.sqrt(variance.reshape((1, C, 1, 1)) + epsilon)


## Run the application
To increase DPU kernel utilization and achieve more efficient scheduling, we use a multithreading model: each thread runs the model on one batch and then moves on to its next batch. The model is broken into four kernels. We first run the first kernel on the FPGA, store its results, and then feed them into the next kernel.


In [None]:
def run_dpu_task(index):
    """Run the convolutional LAMP kernel on every `thread_num`-th batch.

    Args:
        index: Worker id. It is both the first batch of `valid_gen` this worker
            processes and the slot of `results` its outputs are appended to.
            The worker then advances in steps of `thread_num`.

    Reads the module-level names `kernel`, `valid_gen`, `thread_num` and
    `results`. It also publishes the output tensor scale as module-level
    `scale_out`, because the host-side post-processing cell reads that name.
    """
    # Every worker writes the same value, so the assignment order between
    # threads does not matter.
    global scale_out

    # dpuCreateTask takes the kernel handle as well as the mode.
    task = n2cube.dpuCreateTask(kernel, 0)

    result_index = index

    try:
        scale_in = n2cube.dpuGetInputTensorScale(task, "conv2d_4_Conv2D", 0)
        scale_out = n2cube.dpuGetOutputTensorScale(task, "conv2d_12_Conv2D", 0)

        # Tensor sizes do not change between runs; query them once.
        input_len = n2cube.dpuGetInputTensorSize(task, "conv2d_4_Conv2D")
        conv_size = n2cube.dpuGetOutputTensorSize(task, "conv2d_12_Conv2D")

        while index < len(valid_gen):

            x_test, _ = valid_gen[index]
            x_test = np.float32(x_test)

            # Iterate over the samples actually present, so a final batch
            # shorter than `batch_size` does not index out of range.
            for sample in x_test:

                data = sample[np.newaxis, ...]
                feed_data = data / scale_in

                # The same sample is fed to both input nodes.
                n2cube.dpuSetInputTensorInHWCFP32(task, "conv2d_4_Conv2D", feed_data, input_len)
                n2cube.dpuSetInputTensorInHWCFP32(task, "conv2d_1_Conv2D", feed_data, input_len)

                n2cube.dpuRunTask(task)

                conv_out = n2cube.dpuGetOutputTensorInHWCFP32(task, "conv2d_12_Conv2D", conv_size)
                conv_out = np.reshape(conv_out, (1, 256, 1, 192))

                results[result_index].append(conv_out)

            # Advance once per batch, not once per sample.
            index += thread_num
    finally:
        # Release the task even if a batch fails.
        n2cube.dpuDestroyTask(task)


thread_num = 8
thread_all = []
# One independent list per worker; `[None] * n` cannot be appended to.
results = [[] for _ in range(thread_num)]

for i in range(thread_num):
    # `args` must be a tuple: `(i)` is just the int `i`.
    t1 = threading.Thread(target=run_dpu_task, args=(i,))
    thread_all.append(t1)

for t in thread_all:
    t.start()
for t in thread_all:
    t.join()

n2cube.dpuDestroyKernel(kernel)

The second kernel, global average pooling, is implemented on the host CPU.

In [None]:
# One independent output list per worker thread, mirroring `results`.
# (`[None] * n` holds None entries, which have no `.append`.)
results_avg = [[] for _ in range(thread_num)]

# `scale_out` is read from module scope here; it must hold the output tensor
# scale of the convolution kernel.
for i in range(thread_num):
    for r in results[i]:
        out_scaled = r / scale_out

        # Mean over axes 1 and 2; apply_over_axes keeps them as size-1 axes.
        global_avg = np.apply_over_axes(np.mean, out_scaled, [1, 2])
        results_avg[i].append(global_avg)

The third kernel, the dense layer, is implemented on the FPGA. As with the first kernel, we use a multithreading model to run it and gather the results.


In [None]:
# Load the compiled dense-layer model ELF into the overlay.
overlay.load_model("dpu_dense_2.elf")

# Open the DPU and load the "dense_2" kernel; `kernel` now refers to this kernel.
n2cube.dpuOpen()
kernel = n2cube.dpuLoadKernel("dense_2")

In [None]:
def run_dpu_task_dense(index):
    """Run the dense-layer kernel on every pooled output in `results_avg[index]`.

    Args:
        index: Worker id; selects the slot of `results_avg` to read from and
            the slot of `result_dense` to append to.

    Reads the module-level names `kernel`, `results_avg` and `result_dense`.
    It also publishes the output tensor scale as module-level `scale_out`,
    because the final post-processing cell reads that name.
    """
    # Every worker writes the same value, so the assignment order between
    # threads does not matter.
    global scale_out

    task = n2cube.dpuCreateTask(kernel, 0)

    try:
        scale_in = n2cube.dpuGetInputTensorScale(task, "dense_1_MatMul", 0)
        scale_out = n2cube.dpuGetOutputTensorScale(task, "dense_1_MatMul", 0)

        # Tensor sizes do not change between runs; query them once.
        input_len = n2cube.dpuGetInputTensorSize(task, "dense_1_MatMul")
        dense_size = n2cube.dpuGetOutputTensorSize(task, "dense_1_MatMul")

        for res in results_avg[index]:

            feed_input = res / scale_in

            # Feed the scaled input computed above (the name was previously mistyped).
            n2cube.dpuSetInputTensorInHWCFP32(task, "dense_1_MatMul", feed_input, input_len)

            n2cube.dpuRunTask(task)

            dense_out = n2cube.dpuGetOutputTensorInHWCFP32(task, "dense_1_MatMul", dense_size)
            dense_out = np.reshape(dense_out, (1, 1, 1, 256))

            result_dense[index].append(dense_out)
    finally:
        # Release the task even if a run fails.
        n2cube.dpuDestroyTask(task)


thread_all = []
# One independent list per worker; `[None] * n` cannot be appended to.
result_dense = [[] for _ in range(thread_num)]

for i in range(thread_num):
    # `args` must be a tuple: `(i)` is just the int `i`.
    t1 = threading.Thread(target=run_dpu_task_dense, args=(i,))
    thread_all.append(t1)

for t in thread_all:
    t.start()
for t in thread_all:
    t.join()

n2cube.dpuDestroyKernel(kernel)


Finally, the last layer (the sigmoid function) is implemented on the host, and the results are written to a text file.

In [None]:
# Apply the sigmoid on the host and append the predictions to predict.txt.
# `scale_out` is read from module scope here; it must hold the output tensor
# scale of the dense kernel.
# The `with` block closes the file even if a write fails part-way through.
with open('predict.txt', 'a+') as f:

    for i in range(thread_num):
        for r in result_dense[i]:

            out_scaled = r / scale_out
            sigmoid_out = 1 / (1 + np.exp(-out_scaled))

            # np.savetxt accepts only 1-D or 2-D arrays; collapse the leading
            # axes so each prediction vector is written as one row.
            rows = np.reshape(sigmoid_out, (-1, np.shape(sigmoid_out)[-1]))
            np.savetxt(f, rows)

In [2]:
#print(np.mean(np.abs((sigmoid_out - y) / sigmoid_out)) * 100)