In [1]:
import time

import keras
import numpy as np
from keras import backend as K
from keras.datasets import mnist
from keras.models import load_model

import cv2
import pynq.lib.dma
from pynq import MMIO, PL, DefaultHierarchy, Overlay, Xlnk

Using Theano backend.


In [2]:
class FPGA_FULL_CONNECT_NET(DefaultHierarchy):
    """Driver for the fully connected network accelerator behind ``axi_dma_0``.

    Every transaction is one int16 stream: an 8-word header
    ``[index, batch_size, KerDim, IFMCH, IFMDim, OFMCH, OFMDim, PadDim]``
    followed by the payload (scaled weights or input samples).
    """

    # Number of int16 words in the header that precedes every payload.
    HEADER_WORDS = 8
    # Weights are multiplied by this factor before being truncated to int16.
    WEIGHT_SCALE = 32768

    def __init__(self, description):
        super().__init__(description)

    def _run_dma(self, payload, out_words):
        """Stream ``payload`` through ``axi_dma_0`` and collect the reply.

        Args:
            payload: 1-D int16 array (header + data) to send.
            out_words: number of int16 words to receive.

        Returns:
            ``(out_buffer, elapsed)``: the receive buffer and the elapsed
            time in seconds spent on the DMA transfers alone.
        """
        xlnk = Xlnk()
        in_buffer = xlnk.cma_array(shape=(payload.shape[0],), dtype=np.int16)
        out_buffer = xlnk.cma_array(shape=(out_words,), dtype=np.int16)
        # One bulk copy instead of a per-element Python loop.
        in_buffer[:] = payload
        # perf_counter measures elapsed time; process_time would only count
        # CPU time of this process.
        start_time = time.perf_counter()
        self.axi_dma_0.sendchannel.transfer(in_buffer)
        self.axi_dma_0.recvchannel.transfer(out_buffer)
        self.axi_dma_0.sendchannel.wait()
        self.axi_dma_0.recvchannel.wait()
        elapsed = time.perf_counter() - start_time
        return out_buffer, elapsed

    def loadweight(self, W, index, IFMDim, OFMDim, PadDim):
        """Send one layer's weights to the accelerator.

        Args:
            W: weight array indexed as ``(OFMCH, IFMCH, KerDim)``; only the
                first three entries of ``W.shape`` are read.
            index: value written to the first header word
                (presumably the layer slot to load -- confirm against the
                hardware design).
            IFMDim, OFMDim, PadDim: copied verbatim into the header.

        The weights are scaled by ``WEIGHT_SCALE``, saturated to the int16
        range and truncated to int16. The data received back from the DMA
        is discarded.
        """
        out_ch, in_ch, ker_dim = W.shape[0], W.shape[1], W.shape[2]
        batch_size = 0
        int16_range = np.iinfo(np.int16)
        # Saturate instead of relying on an out-of-range float -> int16 cast.
        kernel_val = np.clip(W.ravel() * self.WEIGHT_SCALE,
                             int16_range.min, int16_range.max)
        header = [index, batch_size, ker_dim, in_ch,
                  IFMDim, out_ch, OFMDim, PadDim]
        kernel = np.append(header, kernel_val).astype(np.int16)
        # The receive buffer has the same length as the data sent.
        self._run_dma(kernel, kernel.shape[0])

    def execute(self, test_data, batch_size):
        """Run inference on the first ``batch_size`` samples of ``test_data``.

        Args:
            test_data: array of samples; the first ``batch_size`` entries
                are flattened and cast to int16 before being sent.
            batch_size: number of samples to process.

        Returns:
            float32 array with ``batch_size`` rows, each divided by its own
            sum. Rows whose sum is zero are returned as all zeros.

        Raises:
            ValueError: if ``batch_size`` is not in ``1..len(test_data)``.
        """
        output_ch = 10
        if not 0 < batch_size <= len(test_data):
            raise ValueError(
                "batch_size must be between 1 and len(test_data) (%d), got %r"
                % (len(test_data), batch_size))
        # Header fields follow the layout used by loadweight; index 0 is
        # presumably the "run inference" command -- confirm against the
        # hardware design.
        header = [0, batch_size, 1, 1, 28, output_ch, 1, 0]
        input_val = np.append(
            header, test_data[0:batch_size].ravel()).astype(np.int16)
        out_buffer, elapsed = self._run_dma(
            input_val, self.HEADER_WORDS + output_ch * batch_size)
        print("Elapsed Test Time: ", elapsed)
        # Skip the leading header words; astype copies the scores out of the
        # DMA buffer.
        output_mat = out_buffer[self.HEADER_WORDS:].reshape(
            batch_size, -1).astype(np.float32)
        row_sums = output_mat.sum(axis=1, keepdims=True)
        # Normalise each row; leave all-zero rows at zero instead of NaN.
        return np.divide(output_mat, row_sums,
                         out=np.zeros_like(output_mat),
                         where=row_sums != 0)

    @staticmethod
    def checkhierarchy(description):
        """Return True when ``description['ip']`` contains ``axi_dma_0``."""
        return 'axi_dma_0' in description['ip']

In [3]:
# number of target classes (width of the one-hot label vectors)
num_classes = 10

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# flatten every image into one row; the sample count is taken from the data
# instead of being hard-coded, and pixels are stored as int16
x_train = x_train.reshape(len(x_train), -1).astype('int16')
x_test = x_test.reshape(len(x_test), -1).astype('int16')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

10000 test samples


In [4]:
# Saved Keras model file; the path is resolved against the working directory
MODEL_PATH = 'mnist_mlp_model.h5'
model = load_model(MODEL_PATH)

In [5]:
# Bitstream of the MLP accelerator design
BITSTREAM = 'mlp.bit'
overlay = Overlay(BITSTREAM)

In [6]:
# Upload the weights of model layers 0..2 to accelerator slots 1..3.
# get_layer is called with an explicit index= keyword rather than passing 0
# positionally as the layer name.
# NOTE(review): np.transpose is applied to the whole get_weights() list, so
# this assumes each layer holds a single weight tensor (no bias) -- confirm.
for layer_index in range(3):
    weight = np.transpose(model.get_layer(index=layer_index).get_weights())
    overlay.memory.loadweight(weight, layer_index + 1, 1, 1, 0)

In [7]:
%%time
count = 0
batch_size = 100
result = overlay.memory.execute(x_test, batch_size)
for i in range(batch_size):
    if result[i].argmax() == y_test[i].argmax():
        count = count + 1
score = count/batch_size
print('FPGA accuracy:', score)

Elapsed Test Time:  0.35225220899999954
FPGA accuracy: 0.96
CPU times: user 580 ms, sys: 10 ms, total: 590 ms
Wall time: 591 ms


In [12]:
%%time
count = 0
result = model.predict(x_test[0:batch_size])
for i in range(batch_size):
    if result[i].argmax() == y_test[i].argmax():
        count = count + 1
score = count/batch_size
print('Arm accuracy:', score)

Arm accuracy: 0.96
CPU times: user 60 ms, sys: 0 ns, total: 60 ms
Wall time: 56 ms


In [13]:
%%time
score = model.evaluate(x_test[0:batch_size], y_test[0:batch_size])
print('Arm accuracy:', score[1].astype(np.float32))

Arm accuracy: 0.96
CPU times: user 70 ms, sys: 10 ms, total: 80 ms
Wall time: 79.8 ms
