In [1]:
%matplotlib inline
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import timeit as tt

In [2]:
# A 3x4 weight matrix holding 1..12 in row-major order, and a single
# 3-element input row; their product is a single 4-element output row.
weights = np.arange(1, 13, dtype=np.float32).reshape(3, 4)
inputs = np.array([[0.1, 0.2, 0.3]], dtype=np.float32)
outputs = inputs.dot(weights)

# Shapes first ...
print("Inputs (Shape):\n", inputs.shape)
print("Output (Shape):\n", outputs.shape)
print("Weights (Shape):\n", weights.shape)

# ... then the raw values.
print("Inputs:\n", inputs)
print("Weights:\n", weights)
print("Output:\n", outputs)

# Side-by-side picture of "input . weights = output": the first weight row
# shares a line with the input and output, the remaining rows go underneath.
print()
print('Input\t\t\t Weights\t\t\t  Output')
print(inputs[0], '   . \t', weights[0], '\t\t= ', outputs[0])
for weight_row in weights[1:]:
    print('\t\t\t', weight_row)


Inputs (Shape):
 (1, 3)
Output (Shape):
 (1, 4)
Weights (Shape):
 (3, 4)
Inputs:
 [[0.1 0.2 0.3]]
Weights:
 [[ 1.  2.  3.  4.]
 [ 5.  6.  7.  8.]
 [ 9. 10. 11. 12.]]
Output:
 [[3.8000002 4.4       5.        5.6000004]]

Input			 Weights			  Output
[0.1 0.2 0.3]    . 	 [1. 2. 3. 4.] 		=  [3.8000002 4.4       5.        5.6000004]
			 [5. 6. 7. 8.]
			 [ 9. 10. 11. 12.]


In [3]:
# how it's done in dot.sv
def pydot(inputs, weights):
    """Multiply a single input row by a weight matrix using explicit loops.

    Args:
        inputs: A nested sequence; only the first element (the input row)
            is used.
        weights: A 2-D array with one row per input element and one column
            per output element (it must provide ``.shape``).

    Returns:
        A 1-D float32 NumPy array with ``weights.shape[1]`` elements.
    """
    vector = inputs[0]  # strip the outer nesting
    n_inputs = weights.shape[0]
    n_outputs = weights.shape[1]
    acc = np.zeros(n_outputs, dtype=np.float32)
    # Visit one input element at a time and add its contribution to every
    # output; each running sum is stored back as float32.
    for row in range(n_inputs):
        for col in range(n_outputs):
            acc[col] += weights[row][col] * vector[row]
    return acc

# First line: the hand-written loop. Second line: the NumPy reference.
print(pydot(inputs, weights), np.dot(inputs, weights)[0], sep="\n")

[3.8000002 4.4       5.        5.6000004]
[3.8000002 4.4       5.        5.6000004]


In [11]:
from pynq import Overlay
from pynq import MMIO
from pynq import allocate

class HwDot:
    """Run fixed-size dot products on an overlay through its AXI DMA engines.

    Three sizes are exposed: 20x10, 40x20 and 80x40 (inputs x outputs).
    Each size uses its own DMA handle (``axi_dma_0``, ``axi_dma_1`` and
    ``axi_dma_2`` on the overlay) and its own float32 input/output buffers
    obtained from ``allocate``.
    """

    def __init__(self, bitstream):
        """Load the overlay and prepare one DMA handle and buffer pair per size.

        Args:
            bitstream: Handed unchanged to ``Overlay``.
        """
        overlay = Overlay(bitstream)
        self.overlay = overlay

        self.dma20x10 = overlay.axi_dma_0
        self.dma40x20 = overlay.axi_dma_1
        self.dma80x40 = overlay.axi_dma_2

        self.in20x10, self.out20x10 = self._alloc_pair(20, 10)
        self.in40x20, self.out40x20 = self._alloc_pair(40, 20)
        self.in80x40, self.out80x40 = self._alloc_pair(80, 40)

    @staticmethod
    def _alloc_pair(n_in, n_out):
        """Allocate a float32 (input, output) buffer pair of the given lengths."""
        in_buf = allocate(shape=(n_in,), dtype=np.float32)
        out_buf = allocate(shape=(n_out,), dtype=np.float32)
        return in_buf, out_buf

    def dot20x10(self, inputs):
        """Copy ``inputs`` into the 20-element buffer and run the 20x10 DMA.

        Returns ``self.out20x10`` itself (not a copy).
        """
        staged = self.in20x10
        np.copyto(staged, inputs)
        return self._dot(self.dma20x10, staged, self.out20x10)

    def dot40x20(self, inputs):
        """Copy ``inputs`` into the 40-element buffer and run the 40x20 DMA.

        Returns ``self.out40x20`` itself (not a copy).
        """
        staged = self.in40x20
        np.copyto(staged, inputs)
        return self._dot(self.dma40x20, staged, self.out40x20)

    def dot80x40(self, inputs):
        """Copy ``inputs`` into the 80-element buffer and run the 80x40 DMA.

        Returns ``self.out80x40`` itself (not a copy).
        """
        staged = self.in80x40
        np.copyto(staged, inputs)
        return self._dot(self.dma80x40, staged, self.out80x40)

    def _dot(self, dma, inputs, outputs):
        """Start the send and receive transfers on ``dma`` and wait for both.

        Both transfers are started before either one is waited on.

        Args:
            dma: Object exposing ``sendchannel`` and ``recvchannel``.
            inputs: Buffer handed to the send channel.
            outputs: Buffer handed to the receive channel.

        Returns:
            The ``outputs`` object that was passed in.
        """
        send = dma.sendchannel
        send.transfer(inputs)
        recv = dma.recvchannel
        recv.transfer(outputs)

        send.wait()
        recv.wait()

        return outputs

In [12]:
def approx_equal(v0, v1, error=1E-5):
    """Compare two sequences element by element within an absolute tolerance.

    Args:
        v0: First iterable of numbers.
        v1: Second iterable of numbers.
        error: Two elements count as equal when ``abs(x - y) < error``.

    Returns:
        A list of bools, one per position. When the inputs differ in length,
        every position that exists in only one of them is reported as
        ``False``, so ``all(approx_equal(a, b))`` cannot succeed on a short
        or empty result.
    """
    left, right = list(v0), list(v1)
    results = [bool(abs(x - y) < error) for x, y in zip(left, right)]
    # zip() stops at the shorter input; flag the unmatched tail as unequal
    # instead of silently dropping it.
    results.extend([False] * abs(len(left) - len(right)))
    return results

In [13]:
# Load the 20x10 test vectors from disk.
with open('weights.20x10.json') as f:
    weights20x10 = np.array(json.load(f))
with open('inputs.20x10.json') as f:
    inputs20x10 = json.load(f)

# Software reference result.
sw_outputs = np.dot([inputs20x10], weights20x10)

# Hardware result from the unpipelined bitstream.
unpipe_dot = HwDot('unpipelined.bit')
unpipe_outputs = unpipe_dot.dot20x10(inputs20x10)

equal = approx_equal(sw_outputs[0], unpipe_outputs)
print('Equal: ', all(equal))


# Zero-argument wrappers so timeit can call each implementation directly.
def py_test():
    return pydot([inputs20x10], weights20x10)


def np_test():
    return np.dot([inputs20x10], weights20x10)


def unpipe_test():
    return unpipe_dot.dot20x10(inputs20x10)


# Time 1000 calls of each implementation and report the total.
for label, bench in (
    ("Timing Python", py_test),
    ("Timing Numpy", np_test),
    ("Timing Unpipelined Hardware", unpipe_test),
):
    print(label)
    time = tt.timeit(bench, number=1000)
    print("Total Time:" + str(time) + " seconds")
    print()

Equal:  True
Timing Python
Total Time:14.906354371923953 seconds

Timing Numpy
Total Time:0.07731177192181349 seconds

Timing Unpipelined Hardware
Total Time:0.7415093616582453 seconds



## Update your Bitstream with a Pipelined Dot, then run this block

In [14]:
# Load the 20x10 test vectors from disk.
with open('weights.20x10.json') as f:
    weights20x10 = np.array(json.load(f))
with open('inputs.20x10.json') as f:
    inputs20x10 = json.load(f)

# Software reference result.
sw_outputs = np.dot([inputs20x10], weights20x10)

# Hardware result from the pipelined bitstream. The `unpipe_*` names are
# kept from the previous cell so anything that refers to them keeps working.
unpipe_dot = HwDot('bitstream.bit')
# HwDot has no generic `dot` method; the 20x10 entry point is `dot20x10`.
unpipe_outputs = unpipe_dot.dot20x10(inputs20x10)

equal = approx_equal(sw_outputs[0], unpipe_outputs)
print('Equal: ', all(equal))


# Zero-argument wrappers so timeit can call each implementation directly.
def py_test():
    return pydot([inputs20x10], weights20x10)


def np_test():
    return np.dot([inputs20x10], weights20x10)


def unpipe_test():
    return unpipe_dot.dot20x10(inputs20x10)


# Time 1000 calls of each implementation and report the total.
for label, bench in (
    ("Timing Python", py_test),
    ("Timing Numpy", np_test),
    ("Timing Pipelined Hardware", unpipe_test),
):
    print(label)
    time = tt.timeit(bench, number=1000)
    print("Total Time:" + str(time) + " seconds")
    print()

OSError: Bitstream file bitstream.bit does not exist.

## Now update this to time the 20x10, 40x20, and 80x40 Dots.  

## Then estimate how the Hardware compares to NumPy, and at what problem size the Hardware becomes faster than NumPy.