# SETTING

In [1]:
from pynq import allocate
from pynq import Overlay
import numpy as np

#overlay = Overlay('euclidean.bit')
#overlay = Overlay('systolic_cordic.bit')
#overlay = Overlay('array.bit')
overlay = Overlay('horner.bit')



In [2]:
from pynq import Clocks
Clocks.fclk0_mhz = 100
print(f'CPU:   {Clocks.cpu_mhz:.6f}MHz')
print(f'FCLK0: {Clocks.fclk0_mhz:.6f}MHz')

CPU:   1199.988000MHz
FCLK0: 98.887900MHz


In [3]:
def alu(data):
    in_buffer  = allocate(shape=(np.size(data),), dtype=np.int64)
    out_buffer = allocate(shape=(np.size(data),), dtype=np.int8)
    np.copyto(in_buffer, data)
    in_buffer.sync_to_device()
    overlay.axi_dma_0.recvchannel.transfer(out_buffer)
    overlay.axi_dma_0.sendchannel.transfer(in_buffer)
    overlay.axi_dma_0.sendchannel.wait()
    overlay.axi_dma_0.recvchannel.wait()
    out_buffer.sync_from_device()
    return out_buffer

# DEBUG

In [4]:
def interleave_with_one(xx, yy, zz, one=1):
    if not (len(xx) == len(yy) == len(zz)):
        raise ValueError("xx/yy/zz 長度必須一致")
    n = len(xx)
    out = [None] * (4 * n)
    out[0::4] = xx
    out[1::4] = yy
    out[2::4] = zz
    out[3::4] = [one] * n
    return np.array(out)

In [5]:
ran = 1000
res = 20
xx = np.arange(ran / res / 2, ran, ran / res)
yy = np.arange(ran / res / 2, ran, ran / res)
zz = np.arange(ran / res / 2, ran, ran / res)
SCALE = 41
X0    = -20480
Y0    = -16384
Z0    = -17613
S = np.eye(3,dtype=np.uint16)*SCALE
T = np.array([[X0], [Y0], [Z0]])
SRT = np.hstack([S, T])
SRT = SRT.ravel()
vector = interleave_with_one(xx, yy, zz)
input_array = np.concatenate((SRT, vector))

In [6]:
import numpy as np
import time

# 1. 先定義每個軸的刻度 (Ticks)
ticks = np.arange(ran / res / 2, ran, ran / res)

# 2. 使用 meshgrid 生成所有組合
# indexing='ij' 很重要，它確保順序是 x 變化最慢，z 變化最快
# 如果你想要 x 變化最快，可以使用 indexing='xy' (預設) 或調整參數順序
xx_grid, yy_grid, zz_grid = np.meshgrid(ticks, ticks, ticks, indexing='ij')

# 3. 將多維網格展平回一維陣列，以便進行後續的 vector 計算
xx_ = xx_grid.ravel()
yy_ = yy_grid.ravel()
zz_ = zz_grid.ravel()

# 構造 [x, y, z, 1, x, y, z, 1, ...]
ones = np.ones_like(xx_)
# 堆疊成 (N, 4) 的形狀: [[x0, y0, z0, 1], [x1, y1, z1, 1], ...]
stacked = np.column_stack((xx_, yy_, zz_, ones))
vector = stacked.ravel() # 展平成一維

# 驗證輸出
print("前 12 個數字 (3組座標):")
print(vector[:100])

前 12 個數字 (3組座標):
[ 25.  25.  25.   1.  25.  25.  75.   1.  25.  25. 125.   1.  25.  25.
 175.   1.  25.  25. 225.   1.  25.  25. 275.   1.  25.  25. 325.   1.
  25.  25. 375.   1.  25.  25. 425.   1.  25.  25. 475.   1.  25.  25.
 525.   1.  25.  25. 575.   1.  25.  25. 625.   1.  25.  25. 675.   1.
  25.  25. 725.   1.  25.  25. 775.   1.  25.  25. 825.   1.  25.  25.
 875.   1.  25.  25. 925.   1.  25.  25. 975.   1.  25.  75.  25.   1.
  25.  75.  75.   1.  25.  75. 125.   1.  25.  75. 175.   1.  25.  75.
 225.   1.]


In [7]:
i0 = [120,600,100]
i1 = [800,200,200]
i2 = [450,180,250]
# interface 2:
i3 = [150,460,760]
i4 = [880,120,700]
i5 = [500,700,730]
i = np.vstack([i0, i1, i2, i3, i4, i5, i0, i3])
# orientation point:
o0 = [300,800,500]
o1 = [600,0,400]
o2 = [800,500,400]
o = np.vstack([o0, o1, o2])
o = np.hstack([o, np.ones((o.shape[0],1))])
i = np.hstack([i, np.ones((i.shape[0],1))])

In [8]:
vector[:12] = [200,0,600,1,800,0,400,1,800,0,200,1]

In [9]:
vector[:20]

array([200.,   0., 600.,   1., 800.,   0., 400.,   1., 800.,   0., 200.,
         1.,  25.,  25., 175.,   1.,  25.,  25., 225.,   1.])

In [10]:
# compose SRT, o, i, vector

input_array = np.concatenate((SRT, o.ravel(), i.ravel(), vector))
#input_array = np.concatenate((SRT, o.ravel(), i.ravel(), [25,25,25,1]))
groups = input_array.reshape(-1, 4).astype(np.int64)
packed_u64 = (groups[:, 3] << 48) | (groups[:, 2] << 32) | (groups[:, 1] << 16) | groups[:, 0]

In [11]:
groups[:20]

array([[    41,      0,      0, -20480],
       [     0,     41,      0, -16384],
       [     0,      0,     41, -17613],
       [   300,    800,    500,      1],
       [   600,      0,    400,      1],
       [   800,    500,    400,      1],
       [   120,    600,    100,      1],
       [   800,    200,    200,      1],
       [   450,    180,    250,      1],
       [   150,    460,    760,      1],
       [   880,    120,    700,      1],
       [   500,    700,    730,      1],
       [   120,    600,    100,      1],
       [   150,    460,    760,      1],
       [   200,      0,    600,      1],
       [   800,      0,    400,      1],
       [   800,      0,    200,      1],
       [    25,     25,    175,      1],
       [    25,     25,    225,      1],
       [    25,     25,    275,      1]])

In [12]:
resolution = np.array([res**3])
weight = np.array([  90764., -156769., -102156.,   21288.,   10942.,    1842.,
         62610.,  -30465.,   29427., -792988., 1132107., -337532.,
         408508.,  -36278.,  -37493.,   88212.]).astype(np.int64)
packed_u64 = np.concatenate((resolution, weight, packed_u64)).astype(np.int64)

In [13]:
packed_u64[:50]

array([                8000,                90764,              -156769,
                    -102156,                21288,                10942,
                       1842,                62610,               -30465,
                      29427,              -792988,              1132107,
                    -337532,               408508,               -36278,
                     -37493,                88212, -5764607523034234839,
       -4611686018424700928, -4957618588711124992,      283622512787756,
            283192963629656,      283192996397856,      281904512761976,
            282333983277856,      282548730331586,      284739182002326,
            284481461683056,      284610348712436,      281904512761976,
            284739182002326,      284051957088456,      283192963629856,
            282333970170656,      282226597625881,      282441345990681,
            282656094355481,      282870842720281,      283085591085081,
            283300339449881,      283515087814681, 

In [14]:
np.size(packed_u64)

8031

In [15]:
t1 = time.time()
PL_result = alu(packed_u64)
t2 = time.time()
t2 - t1
field = PL_result.astype(np.int8)
field[0:50]

PynqBuffer([1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2], dtype=int8)

In [16]:
np.where(field==0)

(array([8000, 8001, 8002, 8003, 8004, 8005, 8006, 8007, 8008, 8009, 8010,
        8011, 8012, 8013, 8014, 8015, 8016, 8017, 8018, 8019, 8020, 8021,
        8022, 8023, 8024, 8025, 8026, 8027, 8028, 8029, 8030]),)

In [17]:
raise Exception("執行已在此手動停止，後續 Cell 將不會執行。")

Exception: 執行已在此手動停止，後續 Cell 將不會執行。

# SERVER

In [None]:
import socket, struct, numpy as np
from pynq import Overlay, allocate

DT = np.int64
BYTES_PER = 8  # np.int64 size

# 優化設定
SOCK_BUF_SIZE = 8 * 1024 * 1024  # 8MB Buffer

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    # 加大接收與傳送緩衝區
    s.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, SOCK_BUF_SIZE)
    s.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, SOCK_BUF_SIZE)
    
    s.bind(("0.0.0.0", 8765))
    s.listen(1)
    print("PYNQ server listening ...")

    # 預配 DMA 緩衝
    cap_elems = 1
    inbuf  = allocate((cap_elems,), DT, cacheable=0)
    outbuf = allocate((cap_elems,), DT, cacheable=0)
    
    # 預先建立 header buffer 避免重複 malloc
    header_buf = bytearray(8) 
    header_view = memoryview(header_buf)

    while True:
        conn, addr = s.accept()
        with conn:
            conn.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
            
            while True:
                try:
                    # 1. 讀取長度 (8 bytes)
                    # MSG_WAITALL: 阻塞直到讀滿 8 bytes，無需 loop
                    if conn.recv_into(header_view, 8, socket.MSG_WAITALL) < 8:
                        break
                    
                    nbytes = struct.unpack("<Q", header_view)[0]
                    
                    # 2. 動態調整 DMA Buffer
                    elems = nbytes // BYTES_PER
                    if elems > cap_elems:
                        inbuf.freebuffer()
                        outbuf.freebuffer()
                        inbuf  = allocate((elems,), DT, cacheable=0)
                        outbuf = allocate((elems,), DT, cacheable=0)
                        cap_elems = elems

                    # 3. 直接讀入 DMA (Zero-copy)
                    target_view = inbuf.view(np.uint8)[:nbytes]
                    conn.recv_into(target_view, nbytes, socket.MSG_WAITALL)

                    # 4. 硬體加速
                    out = alu(inbuf[:elems])

                    # 5. 回傳
                    struct.pack_into("<Q", header_buf, 0, out.nbytes)
                    conn.sendall(header_buf) 
                    # 再送資料 (Zero-copy)
                    conn.sendall(out)
                    
                except Exception as e:
                    break

PYNQ server listening ...


# COMPARE

In [None]:
'''
import matplotlib as mpl
import matplotlib.pyplot as plt

# 若你不確定動過什麼樣式，先全部重置
mpl.rcParams.update(mpl.rcParamsDefault)   # 重置成預設風格

# 強制所有圖的底色與儲存底色用白的
mpl.rcParams.update({
    "figure.facecolor": "white",
    "axes.facecolor": "white",
    "savefig.facecolor": "white",
    "savefig.edgecolor": "white"
})
'''

In [None]:
''''
# Jupyter 單格版：只量三維 norm 計算時間，並畫圖（可選擇存檔）
import numpy as np
import matplotlib.pyplot as plt
from time import perf_counter
import csv
import time

# 參數區
sample_sizes = [100,1000,10000,100000,1000000]
LOW, HIGH = -128, 128
REPEAT = 10            # 每個樣本數重複量測次數，取平均
SEED = 12345          # 基礎亂數種子（會加上重複次數索引）
LOGX = True          # True 就用對數 X 軸
DO_SAVE = True       # True 就存 PNG 與 CSV
PNG_PATH = "norm_time.png"
CSV_PATH = "norm_time.csv"

import matplotlib

def time_norm_once(n_samples: int, low: int, high: int, seed: int | None = None):
    rng = np.random.default_rng(seed)
    input_array = rng.integers(low, high, size=n_samples*4+12, dtype=np.int16).astype(np.uint16)
    groups = input_array.reshape(-1, 4).astype(np.int64)
    packed_u64 = (groups[:, 3] << 48) | (groups[:, 2] << 32) | (groups[:, 1] << 16) | groups[:, 0]
    a = input_array[12:].reshape(-1, 4).astype(np.int16)
    b = input_array[:12].reshape(3, 4).astype(np.int16).T

    t0 = time.perf_counter()
    PS = a @ b
    PS_result = np.linalg.norm(PS.reshape(-1, 3), axis=1)
    t1 = time.perf_counter()
    PL_result = alu(packed_u64)
    t2 = time.perf_counter()
    return t1 - t0, t2 - t1

# 量測
n_list, g_list, PS_list, PL_list = [], [], [], []
for idx, n in enumerate(sample_sizes):
    acc1 = 0.0
    acc2 = 0.0
    g_keep = None
    for r in range(REPEAT):
        t1, t2 = time_norm_once(n, LOW, HIGH, seed=SEED + r)
        acc1 += t1
        acc2 += t2
    n_list.append(n)
    PS_list.append(acc1 / REPEAT)
    PL_list.append(acc2 / REPEAT)

# 可選：存檔
if DO_SAVE:
    plt.figure(figsize=(8, 5))
    plt.plot(n_list, PL_list, marker="o", linestyle="-", label="PL computation time")
    plt.plot(n_list, PS_list, marker="o", linestyle="-", label="PS computation time")
    if LOGX:
        plt.xscale("log")
    plt.xlabel("number of samples")
    plt.ylabel("time (s)")
    plt.title("Computation time compare")
    plt.grid(True, which="both", alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.savefig(PNG_PATH, dpi=150)
    with open(CSV_PATH, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["n_samples", "PL_time_s", "PS_time_s"])
        for n, t1, t2 in zip(n_list, PS_list, PL_list):
            writer.writerow([n, f"{t1:.9f}", f"{t2:.9f}"])
    print(f"已存圖檔: {PNG_PATH}")
    print(f"已存 CSV: {CSV_PATH}")
'''