In [1]:
import tflite_runtime.interpreter as tf
from pynq import Overlay, allocate
import numpy as np
import random
import time
import pickle
import matplotlib.pyplot as plt
import random

# Load the TensorFlow Lite Model

In [2]:
# Create the TFLite interpreter for the trained model and allocate its
# tensors; later cells read the layer weights out of this interpreter.
tflite_interpreter = tf.Interpreter(model_path='model.tflite')
tflite_interpreter.allocate_tensors()

input_details = tflite_interpreter.get_input_details()
output_details = tflite_interpreter.get_output_details()

# Print name, shape and dtype of the first input and the first output tensor.
for banner, details in (("== Input details ==", input_details),
                        ("\n== Output details ==", output_details)):
    first = details[0]
    print(banner)
    for label, key in (("name:", 'name'), ("shape:", 'shape'), ("type:", 'dtype')):
        print(label, first[key])

# Uncomment to get the details of every tensor in the model:
# tensor_details = tflite_interpreter.get_tensor_details()

== Input details ==
name: serving_default_input_1:0
shape: [ 1 28 28  1]
type: <class 'numpy.float32'>

== Output details ==
name: StatefulPartitionedCall:0
shape: [ 1 10]
type: <class 'numpy.float32'>


# Load the FPGA CNN Overlay

In [3]:
# Instantiate the PYNQ overlay from the 'cnn.bit' bitstream (per the PYNQ
# documentation, Overlay downloads the bitstream to the FPGA by default).
overlay = Overlay('cnn.bit')

In [4]:
# Handle to the AXI DMA engine of the overlay; later cells stream data to and
# from the accelerator through dma.sendchannel / dma.recvchannel.
# NOTE(review): 'memory' is presumably a hierarchy name in the block design — confirm.
dma = overlay.memory.axi_dma_0

# Load Conv Weights

In [5]:
# Read the convolution kernel (tensor index 2) from the TFLite model and
# quantise it: scale by 127, then cast to int16 (the cast truncates toward zero).
weight = tflite_interpreter.tensor(2)()
weight = np.array(weight * 127.0).astype(np.int16)
print(weight.shape, weight.dtype)
print(weight.max(), weight.min())

# Layer geometry taken from the weight tensor, indexed as
# (OFMCH, KerDim, <unused here>, IFMCH).
OFMCH = weight.shape[0]
IFMCH = weight.shape[3]
KerDim = weight.shape[1]

IFMDim = 1
OFMDim = 1
kernel_val = weight.ravel()
# Stream layout: a 7-word header followed by the flattened weights.
# NOTE(review): the leading 1 presumably tells the accelerator "load conv
# weights" (the FC cell sends 2, the image cells send 0) — confirm against
# the hardware source.
kernel = np.append([1, 0, KerDim, IFMCH, IFMDim, OFMCH, OFMDim], kernel_val)
print(kernel, len(kernel))

in_conv = allocate(shape=(len(kernel),), dtype=np.int16)
out_conv = allocate(shape=(len(kernel),), dtype=np.int16)

# The contiguous DMA buffers are a scarce resource: release them even when
# the copy or the transfer raises.
try:
    np.copyto(in_conv, kernel)

    start = time.time()
    dma.sendchannel.transfer(in_conv)
    dma.recvchannel.transfer(out_conv)
    dma.sendchannel.wait()
    dma.recvchannel.wait()
    end = time.time()
    fpga_runtime = end - start
    print(fpga_runtime)

    # Prints True when the stream received back equals the stream sent.
    print((in_conv == out_conv).all())
finally:
    in_conv.close()
    out_conv.close()

(16, 5, 5, 1) int16
10 -19
[  1   0   5   1   1  16   1  -1   0  -6 -12  -8  -3  -9  -1   0  -5 -13
  -5  -6   4  -4 -10  -1   4   5   3 -14 -16  -3  -3   0  -1 -10   0  -1
   0 -12   0   0   0   0 -17   0  -1   2  -1  -7   0   1   1  -4  -4   0
   0   2  -5  -3   0  -8  -9   1   5   2   0  -8   0  -5  -9   0  -7   1
   0  -1   2 -10   1  -2  -7  -8   0   0  -2 -15   0  -8   0   0   2   0
  -7 -14  -1 -14 -12  -7   0  -6  -3  10  -8  -7  -6   3  -3   3   0 -14
   0  -2  -1   3  -7  -9   2  -8  -5   2  -6   4   0  -1   2 -15  -7   4
   0  -5   0  -5  -4  -5   0   0   1   1   0   0   0   0   0  -4  -5   0
  -6  -1  -1   0 -10   0   0   0  -9   0 -10   0   0  -1   0  -4  -9   0
   1   1   3  -5 -12   0   7  -2  -4  -2  -8   0  -7  -8   0 -10  -8  -4
   1   0  -7  -4  -1  -1  -3   0  -2  -7   0   0   2   0  -2  -9  -5   0
   0   1  -2 -10   0   0   0   1  -6   3 -10  -6   0   0 -16   0   2  -7
   1   1   1   1  -6  -5  -5   0 -15   3 -15   0  -4  -5   1 -11   0   0
 -11   1   1 -15   0  -2

# Load FC Weights

In [6]:
# Read the fully-connected weights (tensor index 4) from the TFLite model and
# quantise them: scale by 127, then cast to int16 (the cast truncates toward zero).
weight = tflite_interpreter.tensor(4)()
weight = np.array(weight * 127.0).astype(np.int16)
print(weight.shape, weight.dtype)
print(weight.max(), weight.min())

# Layer geometry taken from the weight matrix, indexed as (OFMCH, IFMCH).
OFMCH = weight.shape[0]
IFMCH = weight.shape[1]

KerDim = 1
IFMDim = 1
OFMDim = 1
kernel_val = weight.ravel()
# Stream layout: a 7-word header followed by the flattened weights.
# NOTE(review): the leading 2 presumably tells the accelerator "load FC
# weights" (the conv cell sends 1, the image cells send 0) — confirm against
# the hardware source.
kernel = np.append([2, 0, KerDim, IFMCH, IFMDim, OFMCH, OFMDim], kernel_val)
print(kernel, len(kernel))

in_fc = allocate(shape=(len(kernel),), dtype=np.int16)
out_fc = allocate(shape=(len(kernel),), dtype=np.int16)

# The contiguous DMA buffers are a scarce resource: release them even when
# the copy or the transfer raises.
try:
    np.copyto(in_fc, kernel)

    start = time.time()
    dma.sendchannel.transfer(in_fc)
    dma.recvchannel.transfer(out_fc)
    dma.sendchannel.wait()
    dma.recvchannel.wait()
    end = time.time()
    fpga_runtime = end - start
    print(fpga_runtime)

    # Prints True when the stream received back equals the stream sent.
    print((in_fc == out_fc).all())
finally:
    in_fc.close()
    out_fc.close()

(10, 576) int16
27 -31
[  2   0   1 ...  -6   0 -15] 5767
0.0031185150146484375
True


# Load the MNIST Test Data

In [7]:
# Load the pickled MNIST dataset and keep only the test split.
# The context manager closes the file even if unpickling fails.
# SECURITY: pickle.load can execute arbitrary code — only use a trusted mnist.pkl.
with open('mnist.pkl', 'rb') as f:
    mnist_ds = pickle.load(f, encoding='bytes')
(_, _), (x_test, y_test) = mnist_ds

# Load a Test Image by Index and Forward It to the DMA

In [8]:
# Index of the test image to classify on the FPGA.
num = 2

# Input stream: 7 header words + 28*28 pixels; output stream: 17 words, of
# which the last 10 are read back as class scores below.
in_buffer = allocate(shape=(7 + 28*28,), dtype=np.int16)
out_buffer = allocate(shape=(17,), dtype=np.int16)

# Release the contiguous DMA buffers even when the copy or transfer raises.
try:
    # Halve the pixel values before casting to int16.
    img = np.array(x_test[num] / 2).astype(np.int16)
    # NOTE(review): by position the header matches the weight cells'
    # [cmd, ?, KerDim, IFMCH, IFMDim, OFMCH, OFMDim]; the leading 0 presumably
    # means "run inference" — confirm against the hardware source.
    input_val = np.append([0, 1, 1, 1, 28, 10, 1], img.ravel())
    np.copyto(in_buffer, input_val)

    start = time.time()
    dma.sendchannel.transfer(in_buffer)
    dma.recvchannel.transfer(out_buffer)
    dma.sendchannel.wait()
    dma.recvchannel.wait()
    end = time.time()

    # Copy the scores out so `out` does not remain a view into DMA memory
    # that is freed below.
    out = np.array(out_buffer[7:])
    print(out, y_test[num])
finally:
    in_buffer.close()
    out_buffer.close()

fpga_runtime = end - start
print(fpga_runtime)

[   0 5827    0    0 2774   36  922  571 1914    0] 1
0.002873659133911133


In [9]:
# Import the class directly so the instance created below does not rebind the
# module name it was just reached through; `cnn` still ends up bound to the
# Cnn instance, as before.
from cnn import Cnn

# Quantise the conv and FC weights: scale by 127, then cast to int8.
conv_w = tflite_interpreter.tensor(2)()
conv_w = np.array(conv_w * 127).astype(np.int8)
print(conv_w.shape, conv_w.dtype, conv_w.max(), conv_w.min())

fc_w = tflite_interpreter.tensor(4)()
fc_w = np.array(fc_w * 127).astype(np.int8)
print(fc_w.shape, fc_w.dtype, fc_w.max(), fc_w.min())


# First test image: halve the pixel values, cast to int16 and append a
# trailing axis of length 1.
img = np.array(x_test[0])
img = (img / 2).astype(np.int16)
img = np.expand_dims(img, axis=2)
print(img.shape, img.max(), img.min(), y_test[0])

cnn = Cnn()
cnn.dtype = np.int32

# Time only the forward pass; the FPGA cell likewise stops its timer before
# any post-processing or printing.
start = time.time()

l1 = cnn.conv2d(img, conv_w)
l1b = cnn.relu2d(l1, 3)
l2 = cnn.maxpool2d(l1b, 4)
l3 = cnn.flatten(l2)
l4 = cnn.fc(l3, fc_w)

end = time.time()

# True when the index of the largest score equals the label.
print(list(l4).index(max(l4)) == y_test[0])

arm_runtime = end - start
print('Runtime # ', arm_runtime)

(16, 5, 5, 1) int8 10 -19
(10, 576) int8 27 -31
(28, 28, 1) 127 0 7
True
Runtime #  1.9316461086273193


In [None]:
# Classify every test image on the FPGA, counting correct predictions in
# `acc` and measuring the total wall-clock time in `fpga_runtime`.
in_buffer = allocate(shape=(7 + 28*28,), dtype=np.int16)
out_buffer = allocate(shape=(17,), dtype=np.int16)

acc = 0

# Release the contiguous DMA buffers even when a transfer raises mid-run.
try:
    start = time.time()
    for i in range(len(y_test)):
        # Halve the pixel values, cast to int16 and prepend the 7-word header.
        img = np.array(x_test[i] / 2).astype(np.int16)
        input_val = np.append([0, 1, 1, 1, 28, 10, 1], img.ravel())
        np.copyto(in_buffer, input_val)

        dma.sendchannel.transfer(in_buffer)
        dma.recvchannel.transfer(out_buffer)
        dma.sendchannel.wait()
        dma.recvchannel.wait()

        # The last 10 output words are the class scores; the prediction is
        # the index of the first maximum.
        out = out_buffer[7:]
        out = int(np.argmax(out))
        acc += 1 if out == y_test[i] else 0
        # end='' lets '\r' rewrite the counter in place instead of emitting
        # one output line per image.
        print('\r{}'.format(acc), end='')

    end = time.time()
finally:
    in_buffer.close()
    out_buffer.close()

print()  # finish the in-place counter line
fpga_runtime = end - start
print(fpga_runtime)

In [None]:
# Import the class directly so the instance created below does not rebind the
# module name it was just reached through; `cnn` still ends up bound to the
# Cnn instance, as before.
from cnn import Cnn

# Quantise the conv and FC weights: scale by 127, then cast to int8.
conv_w = tflite_interpreter.tensor(2)()
conv_w = np.array(conv_w * 127).astype(np.int8)

fc_w = tflite_interpreter.tensor(4)()
fc_w = np.array(fc_w * 127).astype(np.int8)


cnn = Cnn()
cnn.dtype = np.int32

# Classify every test image with the software reference, counting correct
# predictions in `acc_arm` and measuring total wall-clock time.
start = time.time()
acc_arm = 0

for i in range(len(y_test)):
    # NOTE(review): the single-image reference cell casts the image to int16
    # while this loop uses int8 — confirm which one cnn.conv2d expects.
    img = np.array(x_test[i] / 2).astype(np.int8)
    img = np.expand_dims(img, 2)
    l1 = cnn.conv2d(img, conv_w)
    l1b = cnn.relu2d(l1, 3)
    l2 = cnn.maxpool2d(l1b, 4)
    l3 = cnn.flatten(l2)
    l4 = cnn.fc(l3, fc_w)
    acc_arm += 1 if y_test[i] == list(l4).index(max(l4)) else 0
    # end='' lets '\r' rewrite the counter in place instead of emitting one
    # output line per image.
    print('\r{}'.format(acc_arm), end='')

end = time.time()
print()  # finish the in-place counter line
arm_runtime = end - start
print('Runtime # ', arm_runtime)

In [12]:
# Speed-up of the FPGA path over the ARM reference as a ratio of run times.
# NOTE: arm_runtime and fpga_runtime are reassigned by several earlier cells
# (weight upload, single-image runs, full benchmark loops), so the figures
# below describe whichever of those cells ran last — run both full benchmark
# loops immediately before this cell for the "# 10000" labels to be accurate.
acc_rate = arm_runtime / fpga_runtime
for label, seconds in (('ARM Run Time # 10000 = ', arm_runtime),
                       ('FPGA Run Time # 10000 = ', fpga_runtime)):
    print(label, seconds)
print('Total Acceleration', acc_rate)
# print('Accuracy loss (Negative means accuracy increase) = ', (acc_arm - acc) / 100, '%')

ARM Run Time # 10000 =  1.9316461086273193
FPGA Run Time # 10000 =  0.002873659133911133
Total Acceleration 672.1904090267983
