# FCCM 2023 Artifact Evaluation

After all the bitstreams have been generated and uploaded, continue with this Jupyter notebook, which runs each bitstream on the board.

You may execute the cells one by one to go through each of the case studies.

In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local helper modules take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:

import logging
import random
from pathlib import Path

import numpy as np
import pandas as pd
import pynq
from pynq import PL
from pynq import Overlay

from MCDMA import MCDMADriver, ChannelTransferDescription
from utils import load_memory

# SOURCE is substituted into every "/home/xilinx/{}/..." path built by the cells
# below, so it selects the directory that overlays and data files are read from.
# Swap the two assignments to switch between the alternative directory names.
# SOURCE = "ae-prebuilt"
SOURCE = "ae"
# ol = Overlay("../moving_average.xclbin")

# ZHW Encoder

E1/E2

In [None]:

# ZHW encoder check: for each data width D, load the encoder overlay, stream the
# input file through the MCDMA engine, and compare every received 64-bit word
# against the golden output file.

# Set to True to print each golden/received word pair during the comparison.
VERBOSE = False

for D in [64]:
    # Downloading bitstream
    ol_name = "/home/xilinx/{}/DS_top_z3_d{}.bit".format(SOURCE, D)
    print("Overlay to load: {}".format(ol_name))
    ol = Overlay(ol_name)
    if D == 64:
        in_file = "/home/xilinx/{}/data/z3data/tc.6.dim.2/m_port.out".format(SOURCE)
        out_file = "/home/xilinx/{}/data/z3data/tc.6.dim.2/s_port.out".format(SOURCE)
    elif D == 32:
        in_file = "/home/xilinx/{}/data/z3data/z3.mem".format(SOURCE)
        out_file = "/home/xilinx/{}/data/z3data/z3_encoded.mem".format(SOURCE)
    else:
        # Fail early with a clear message instead of a NameError on in_file below.
        raise ValueError("Unsupported data width D={}; expected 32 or 64".format(D))

    # Load input data and the golden reference; the receive buffer mirrors the
    # input buffer's shape and dtype.
    bytes_per_line = 8 if D == 64 else 4
    inp_data = load_memory(in_file, bytes_per_line=bytes_per_line)
    out_data_golden = load_memory(out_file, bytes_per_line=bytes_per_line)
    out_data = pynq.allocate(shape=inp_data.shape, dtype=inp_data.dtype)

    # DMA transfer/Control
    dma = ol.axi_mcdma_0
    gpio = ol.axi_gpio_0  # gpio for reset

    gpio.channel1.write(0, mask=0xffffffff)  # reset active low
    gpio.channel1.write(1, mask=0xffffffff)  # deassert reset

    # Prepare for receiving
    logging.debug('Preparing for receving data')

    # Currently, sending data flits that are too large hangs the MCDMA engine
    dma.sendchannel.reset()
    dma.recvchannel.reset()
    send_description = [
        # Sent over channel 0
        ChannelTransferDescription(channel_id=0, array=inp_data, nbytes=inp_data.size * inp_data.itemsize),
    ]
    recv_description = [
        ChannelTransferDescription(channel_id=0, array=out_data, nbytes=out_data.size * out_data.itemsize)
    ]

    # The receive transfer is queued before the send transfer, as in the
    # original cell.
    dma.recvchannel.transfer(recv_description)
    dma.sendchannel.transfer(send_description)

    logging.debug('Waiting to receive')
    dma.sendchannel.wait()
    dma.recvchannel.wait()
    out_data.invalidate()

    # Compare word by word as 64-bit values; the views are taken once rather
    # than on every loop iteration.
    golden_words = out_data_golden.view(np.uint64)
    received_words = out_data.view(np.uint64)
    for i in range(received_words.shape[0]):
        if VERBOSE:
            print(hex(golden_words[i]), hex(received_words[i]))
        assert golden_words[i] == received_words[i], (
            "word {} mismatch: expected {}, got {}".format(
                i, hex(golden_words[i]), hex(received_words[i])))

    print('Test passed')

Overlay to load: /home/xilinx/ae/DS_top_z3_d64.bit


# ZHW Decoder

D1/D2

In [None]:
# LEN is set here but is not referenced by the loop below.
LEN = 16
# This cell only prints the decoder bitstream path for each data width; it does
# not load the overlay.
for D in (32, 64):
    ol_name = "/home/xilinx/{}/DS_top_z7_d{}/DS_top_z7_d{}.bit".format(SOURCE, D, D)
    banner = "Overlay to load: {}".format(ol_name)
    print(banner)
    

# Moving average

## MA-Div

In [17]:
def rolling_floor_mean(window):
    """Return the floor of the mean of ``window`` as ``np.uint64``.

    Defined once here instead of on every loop iteration. The loop below does
    not call it; it is kept so the name stays available in the notebook.
    """
    return np.floor(np.mean(window)).astype(np.uint64)


# MA-Div: run the moving-average overlay for every (dt, dw) combination on the
# input 0..LEN-1 and print the three LEN-long result regions.
LEN = 32
for dt in [16, 64]:
    for dw in [16, 64]:
        ol_name = "/home/xilinx/{}/moving-average-d{}-w{}/moving-average.xclbin".format(SOURCE, dt, dw)
        print("Overlay to load: {}".format(ol_name))
        ol = Overlay(ol_name)

        data = pynq.allocate((LEN, ), np.uint64, target=ol.HP0)
        data[:] = np.arange(LEN)
        mem_out = pynq.allocate((LEN * 3, ), np.uint64, target=ol.HP0)

        # Sync the buffers around the kernel run, matching the matrix-multiply
        # and divider cells: push the input before start, pull the result after.
        data.sync_to_device()
        starter = ol.starter_1
        handle = starter.start(data, mem_out, LEN)
        handle.wait()
        mem_out.sync_from_device()

        # The output buffer is read as three consecutive LEN-long regions.
        m_max = mem_out[:LEN]
        m_min = mem_out[LEN:2 * LEN]
        m_avg = mem_out[2 * LEN:3 * LEN]

        if dt == 16:
            # Keep only the low 16 bits of each result for the dt == 16 builds.
            m_max = np.bitwise_and(m_max, 0xffff)
            m_avg = np.bitwise_and(m_avg, 0xffff)
            m_min = np.bitwise_and(m_min, 0xffff)

        print(m_max)
        print(m_min)
        print(m_avg)
    

Overlay to load: /home/xilinx/ae/moving-average-d16-w16/moving-average.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84468b80>
CTRL + CALLABLE
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 1 1 2 2 2 2 3 3 3 4 4 4 5 5 5 5 6 6 6 7 7 7 7 8 8]
Overlay to load: /home/xilinx/ae/moving-average-d16-w64/moving-average.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84468b80>
CTRL + CALLABLE
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 6]
Overlay to load: /home/xilinx/ae/moving-average-d64-w16/moving-average.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84468b80>
CTRL + CALLABLE
[ 0  1  2  3  4  5  6  7  8  9 10 11 

## MA-Shift

In [None]:
# MA-Shift: run the shift-based moving-average overlay for every (dt, dw)
# combination on the input 0..LEN-1 and print the three LEN-long result regions.
LEN = 16
for dt in [16, 64]:
    for dw in [16, 64]:
        ol_name = "/home/xilinx/{}/moving-average-shift-d{}-w{}/moving-average.xclbin".format(SOURCE, dt, dw)
        print("Overlay to load: {}".format(ol_name))
        ol = Overlay(ol_name)

        data = pynq.allocate((LEN, ), np.uint64, target=ol.HP0)
        data[:] = np.arange(LEN)
        mem_out = pynq.allocate((LEN * 3, ), np.uint64, target=ol.HP0)

        # Sync the buffers around the kernel run, matching the matrix-multiply
        # and divider cells: push the input before start, pull the result after.
        data.sync_to_device()
        starter = ol.starter_1
        handle = starter.start(data, mem_out, LEN)
        handle.wait()
        mem_out.sync_from_device()

        # The output buffer is read as three consecutive LEN-long regions.
        m_max = mem_out[:LEN]
        m_min = mem_out[LEN:2 * LEN]
        m_avg = mem_out[2 * LEN:3 * LEN]

        if dt == 16:
            # Keep only the low 16 bits of each result for the dt == 16 builds.
            m_max = np.bitwise_and(m_max, 0xffff)
            m_avg = np.bitwise_and(m_avg, 0xffff)
            m_min = np.bitwise_and(m_min, 0xffff)

        print(m_max, m_min, m_avg)
    

## MA-Div-Sub

In [22]:
# MA-Div-Sub: run the "pipe" moving-average overlay for every (dt, dw)
# combination on the input 0..LEN-1 and print the three LEN-long result regions.
LEN = 16
for dt in [16, 64]:
    for dw in [16, 64]:
        ol_name = "/home/xilinx/{}/moving-average-pipe-d{}-w{}/moving-average.xclbin".format(SOURCE, dt, dw)
        print("Overlay to load: {}".format(ol_name))
        ol = Overlay(ol_name)

        data = pynq.allocate((LEN, ), np.uint64, target=ol.HP0)
        data[:] = np.arange(LEN)
        mem_out = pynq.allocate((LEN * 3, ), np.uint64, target=ol.HP0)

        # Sync the buffers around the kernel run, matching the matrix-multiply
        # and divider cells: push the input before start, pull the result after.
        data.sync_to_device()
        starter = ol.starter_1
        handle = starter.start(data, mem_out, LEN)
        handle.wait()
        mem_out.sync_from_device()

        # The output buffer is read as three consecutive LEN-long regions.
        m_max = mem_out[:LEN]
        m_min = mem_out[LEN:2 * LEN]
        m_avg = mem_out[2 * LEN:3 * LEN]

        if dt == 16:
            # Keep only the low 16 bits of each result for the dt == 16 builds.
            m_max = np.bitwise_and(m_max, 0xffff)
            m_avg = np.bitwise_and(m_avg, 0xffff)
            m_min = np.bitwise_and(m_min, 0xffff)

        print(m_max)
        print(m_min)
        print(m_avg)
    

Overlay to load: /home/xilinx/ae/moving-average-pipe-d16-w16/moving-average.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84859000>
CTRL + CALLABLE
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 1 1 2 2 2 2 3 3 3]
Overlay to load: /home/xilinx/ae/moving-average-pipe-d16-w64/moving-average.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84859000>
CTRL + CALLABLE
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
Overlay to load: /home/xilinx/ae/moving-average-pipe-d64-w16/moving-average.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84859000>
CTRL + CALLABLE
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 15]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
Overlay to load: /home/xilinx/ae/moving-average-pipe-d64-w64/moving-average.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice 

# Matrix multiplication

In [17]:
# Matrix multiplication check: for each size N, load the matching overlay, feed
# it 2*M*M random words, rebuild the operands A and B and the result D on the
# host from the flat buffers, and compare D with np.matmul(A, B).
for N in [8, 16]:
    M = N
    ol_name = "/home/xilinx/{}/mm-n{}-d8/mm.xclbin".format(SOURCE, N)
    print("Overlay to load: {}".format(ol_name))
    ol = Overlay(ol_name)
    
    # load inputs
    # in_buffer holds 2*M*M words (the first M*M go into A, the next M*M into B
    # below); out_buffer holds the M*M result words.
    in_buffer = pynq.allocate(shape=(2*M*M,), dtype=np.uint32, target=ol.HP0)
    out_buffer = pynq.allocate(shape=(M*M,), dtype=np.uint32, target=ol.HP0)
    # N1 and N2 are the group sizes used by the index arithmetic below; both
    # equal N (and therefore M) in this cell.
    N1 = N
    N2 = N

    # Fill every input word with a random integer in 1..9. No seed is set in
    # this cell, so the values differ from run to run.
    for i in range(0,len(in_buffer)):
        in_buffer[i]=random.randint(1,9) 

    # Scatter the first M*M input words into the host-side matrix A, mapping
    # each flat index i to a (row, column) pair.
    # NOTE(review): the mapping looks like a tiled layout with N1-wide groups;
    # confirm against the kernel's expected input order.
    A = np.zeros((M,M))
    for i in range(M*M):
        base_row = int(i*N1/(M*M))
        base_column = int(int(i%M)/N1)
        column=int(base_column*N1) + (int(i%N1))
        row = base_row + int((i%((M*M)/N1))/M)*N1
        A[row][column] = in_buffer[i]


    # Scatter the second M*M input words (offset by M*M) into B. Compared with
    # A, the row and column formulas are swapped and use N2.
    B = np.zeros((M,M))
    for i in range(M*M):
        base_column = int(i*N2/(M*M))
        base_row = int(int(i%M)/N2)
        row=int(base_row*N2) + (int(i%N2))
        column = base_column + int((i%((M*M)/N2))/M)*N2
        B[row][column] = in_buffer[i+M*M]
        
    # Push the input to the device, run the kernel with the input/output word
    # counts, wait for completion, then pull the result back.
    in_buffer.sync_to_device()
    starter = ol.mms_1
    handle = starter.start(in_buffer, out_buffer, 2*M*M, M*M)
    handle.wait()
    out_buffer.sync_from_device()
    
    
    # Rebuild the result matrix D from the flat output. The position inside each
    # N2-wide group is reversed (N2-1-int(i%N2)) relative to the A mapping.
    D = np.zeros((M,M))
    for i in range(M*M):
        base_row = int(i*N1/(M*M))
        base_column = int(int(i%M)/N2)
        column=int(base_column*N2) + (N2-1-int(i%N2))
        row = base_row + int((i%((M*M)/N1))/M)*N1
        D[row][column] = out_buffer[i]

    
    # Host-side reference product.
    D_truth = np.matmul(A,B)
    
    # Prints True only when every element matches; a mismatch is reported here
    # but does not raise.
    print('Same?: ', np.all(D == D_truth))


Overlay to load: /home/xilinx/ae/mm-n8-d8/mm.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84859000>
CTRL + CALLABLE
Same?:  True
Overlay to load: /home/xilinx/ae/mm-n16-d8/mm.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84859000>
CTRL + CALLABLE
Same?:  True


# Divider

The divider example showcases the use of SystemC threads.


Overlay to load: /home/xilinx/ae/div-8/divider.xclbin


In [16]:



# Divider check: for each bit width n, divide the constant 31720 by 1..LEN on the
# overlay and compare against NumPy integer division, with every operand and the
# result reduced to the low n bits for the widths below 64.
for n in (8, 16, 32, 64):
    LEN = n

    ol_name = "/home/xilinx/{}/div-{}/divider.xclbin".format(SOURCE, n)
    print("Overlay to load: {}".format(ol_name))
    ol = Overlay(ol_name)

    # Operands: a constant dividend and the divisors 1, 2, ..., LEN.
    dividend = pynq.allocate((LEN, ), np.uint64, target=ol.HP0)
    dividend[:] = 31720
    divisor = pynq.allocate((LEN, ), np.uint64, target=ol.HP0)
    divisor[:] = np.arange(1, LEN + 1)

    quotient = pynq.allocate((LEN, ), np.uint64, target=ol.HP0)

    dividend.sync_to_device()
    divisor.sync_to_device()

    starter = ol.mms_1
    handle = starter.start(dividend, divisor, quotient, LEN, LEN)
    handle.wait()

    quotient.sync_from_device()

    # Low-n-bits mask for the 8/16/32-bit builds; the 64-bit build is left
    # unmasked.
    width_mask = {8: 0xff, 16: 0xffff, 32: 0xffffffff}.get(n)

    if width_mask is not None:
        dividend = np.bitwise_and(dividend, width_mask)
        divisor = np.bitwise_and(divisor, width_mask)
    quotient_golden = dividend // divisor

    if width_mask is not None:
        quotient = np.bitwise_and(quotient, width_mask)

    print(np.all(quotient == quotient_golden))
    
    
    

Overlay to load: /home/xilinx/ae/div-8/divider.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84859000>
CTRL + CALLABLE
True
Overlay to load: /home/xilinx/ae/div-16/divider.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84859000>
CTRL + CALLABLE
True
Overlay to load: /home/xilinx/ae/div-32/divider.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84859000>
CTRL + CALLABLE
True
Overlay to load: /home/xilinx/ae/div-64/divider.xclbin
<pynq.pl_server.embedded_device.EmbeddedDevice object at 0xffff84859000>
CTRL + CALLABLE
True
