In [1]:
# Environment sanity check: report which NumPy build and which Python
# interpreter this kernel is actually running.
import sys

import numpy as np

for label, value in (
    ("numpy:", np.__version__),
    ("numpy file:", np.__file__),
    ("python:", sys.executable),
):
    print(label, value)

numpy: 1.26.4
numpy file: /usr/local/share/pynq-venv/lib/python3.10/site-packages/numpy/__init__.py
python: /usr/local/share/pynq-venv/bin/python3


In [2]:
# Load the accelerator overlay from its bitstream file.
from pynq import Overlay

BITSTREAM_PATH = '/root/jupyter_notebooks/aru_sample_wrapper.bit'
overlay = Overlay(BITSTREAM_PATH)

In [3]:
# Show what PYNQ discovered in the overlay: IP blocks, hierarchies, interrupts,
# GPIO outputs and memories.
help(overlay)

Help on Overlay in module pynq.overlay:

<pynq.overlay.Overlay object>
    Default documentation for overlay /root/jupyter_notebooks/aru_sample_wrapper.bit. The following
    attributes are available on this overlay:
    
    IP Blocks
    ----------
    activation_accelerat_0 : pynq.overlay.DefaultIP
    zynq_ultra_ps_e_0    : pynq.overlay.DefaultIP
    
    Hierarchies
    -----------
    None
    
    Interrupts
    ----------
    None
    
    GPIO Outputs
    ------------
    None
    
    Memories
    ------------
    PSDDR                : Memory



In [4]:
# Handle to the accelerator IP, looked up by the instance name listed in the
# overlay help; it is exposed through PYNQ's generic DefaultIP driver.
acc_ip = overlay.activation_accelerat_0 
# Print the driver's documentation for the handle.
help(acc_ip)

Help on DefaultIP in module pynq.overlay object:

class DefaultIP(builtins.object)
 |  DefaultIP(description)
 |  
 |  Driver for an IP without a more specific driver
 |  
 |  This driver wraps an MMIO device and provides a base class
 |  for more specific drivers written later. It also provides
 |  access to GPIO outputs and interrupts inputs via attributes. More specific
 |  drivers should inherit from `DefaultIP` and include a
 |  `bindto` entry containing all of the IP that the driver
 |  should bind to. Subclasses meeting these requirements will
 |  automatically be registered.
 |  
 |  Attributes
 |  ----------
 |  mmio : pynq.MMIO
 |      Underlying MMIO driver for the device
 |  _interrupts : dict
 |      Subset of the PL.interrupt_pins related to this IP
 |  _gpio : dict
 |      Subset of the PL.gpio_dict related to this IP
 |  
 |  Methods defined here:
 |  
 |  __init__(self, description)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  read(se

In [13]:
from pynq import allocate
import numpy as np
import time, os

# ====== 1) Prepare the data ======
in0_path, in1_path = (
    "/root/jupyter_notebooks/test_data/in0_bf16.bin",
    "/root/jupyter_notebooks/test_data/in1_bf16.bin",
)

# One golden (reference) output file per config, keyed by config index 0..6.
# Adjust the paths/naming as needed. To compare against the HLS output dumps
# instead, point each entry at ".../test_data/hls_output_config_<cfg>.bin".
golden_paths = dict(enumerate((
    "/root/jupyter_notebooks/test_data/golden_out_config_0_bf16.bin",
    "/root/jupyter_notebooks/test_data/golden_out_config_1_bf16.bin",
    "/root/jupyter_notebooks/test_data/golden_out_config_2_bf16.bin",
    "/root/jupyter_notebooks/test_data/golden_out_config_3_bf16.bin",
    "/root/jupyter_notebooks/test_data/golden_out_config_4_bf16.bin",
    "/root/jupyter_notebooks/test_data/golden_out_config_5_bf16.bin",
    "/root/jupyter_notebooks/test_data/golden_out_config_6_bf16.bin",
)))

# The inputs are bf16 (two bytes per element); load them as raw uint16 bit patterns.
arr0, arr1 = (
    np.fromfile(src_path, dtype=np.uint16) for src_path in (in0_path, in1_path)
)

# Allocate DMA-capable, physically contiguous buffers: one per input, plus an
# output buffer that is assumed to have the same length as in0.
buf0, buf1, out = (
    allocate(shape=buf_shape, dtype=np.uint16)
    for buf_shape in (arr0.shape, arr1.shape, arr0.shape)
)

# Copy the host data into the buffers and flush each one out to DDR.
np.copyto(buf0, arr0)
buf0.flush()
np.copyto(buf1, arr1)
buf1.flush()
# Zero the output buffer and flush it as well.
out[:] = 0
out.flush()

# Physical addresses that get programmed into the accelerator's registers.
pa0, pa1, pao = (int(dma_buf.physical_address) for dma_buf in (buf0, buf1, out))

# ====== 2) 写地址寄存器（与你 rpt 的偏移一致）======
def wr(off, val):
    """Write the low 32 bits of `val` (converted with int()) to `acc_ip` at offset `off`."""
    word = int(val) & 0xFFFFFFFF
    acc_ip.write(off, word)
def rd(off):
    """Return whatever `acc_ip.read` yields for register offset `off`."""
    value = acc_ip.read(off)
    return value

# Program the three 64-bit buffer addresses. Each one occupies a low word at the
# listed offset and a high word 4 bytes above it (in0: 0x10/0x14, in1: 0x1C/0x20,
# out: 0x28/0x2C).
for lo_off, addr in ((0x10, pa0), (0x1C, pa1), (0x28, pao)):
    wr(lo_off, addr & 0xFFFFFFFF)              # low 32 bits
    wr(lo_off + 4, (addr >> 32) & 0xFFFFFFFF)  # high 32 bits

# Read the registers back (high word first) to confirm what was programmed.
in0_hi, in0_lo = rd(0x14), rd(0x10)
print("in0 = 0x%08X_%08X" % (in0_hi, in0_lo))
in1_hi, in1_lo = rd(0x20), rd(0x1C)
print("in1 = 0x%08X_%08X" % (in1_hi, in1_lo))
out_hi, out_lo = rd(0x2C), rd(0x28)
print("out = 0x%08X_%08X" % (out_hi, out_lo))

# ====== 3) Stage 0: load in0/in1 into BRAM / on-chip storage ======
wr(0x34, 0)   # stage = 0
wr(0x00, 1)   # ap_start
# Time the timeout with a monotonic clock. time.time() follows the wall clock,
# which can be stepped (e.g. by a time sync) and would then fire the timeout
# early or never.
t0 = time.monotonic()
while (rd(0x00) & 0x2) == 0:  # wait for ap_done (bit 1 of the register at 0x00)
    if time.monotonic() - t0 > 5.0:
        raise TimeoutError("stage 0 超时，检查 IP/时钟/复位")
    time.sleep(0.001)
print("stage-0 done; CTRL=", hex(rd(0x00)))

# ====== 4) Stage 1: run every config in a loop and time it ======
# total_time accumulates the stage-1 durations; results maps config -> output copy.
total_time, results = 0.0, {}
def bf16_to_f32(u16_arr: np.ndarray) -> np.ndarray:
    """Reinterpret raw bfloat16 bit patterns as float32 values.

    Each 16-bit pattern is widened to 32 bits and shifted into the upper half of
    the word (the lower 16 bits become zero); the resulting words are then viewed
    as float32 without any numeric conversion.

    Args:
        u16_arr: Array of bf16 bit patterns stored as unsigned 16-bit integers.

    Returns:
        A float32 array with the same shape as `u16_arr`.
    """
    widened = np.left_shift(u16_arr.astype(np.uint32), 16)
    return widened.view(np.float32)

def _wait_ap_done(timeout_s, timeout_msg, spin_s=0.05, poll_sleep_s=0.001):
    """Block until the accelerator reports ap_done (bit 1 of the register at 0x00).

    The register is polled back-to-back for the first `spin_s` seconds so that a
    run lasting about a millisecond is not rounded up to the sleep interval.
    After that the loop sleeps `poll_sleep_s` between reads so a hung core does
    not keep the CPU spinning until the timeout.

    Args:
        timeout_s: Maximum number of seconds to wait.
        timeout_msg: Message carried by the TimeoutError.
        spin_s: Length of the initial no-sleep polling window, in seconds.
        poll_sleep_s: Sleep between polls once the spin window has elapsed.

    Raises:
        TimeoutError: If ap_done is not observed within `timeout_s`.
    """
    started = time.perf_counter()  # monotonic; immune to wall-clock adjustments
    while (rd(0x00) & 0x2) == 0:
        waited = time.perf_counter() - started
        if waited > timeout_s:
            raise TimeoutError(timeout_msg)
        if waited > spin_s:
            time.sleep(poll_sleep_s)


for cfg in range(7):
    # Select the config and switch to stage 1 (compute).
    wr(0x3C, cfg)
    wr(0x34, 1)
    # Start the core and time it. perf_counter() is a monotonic high-resolution
    # clock; together with the no-sleep polling window this keeps the reported
    # time from being dominated by a 1 ms poll sleep.
    t_start = time.perf_counter()
    wr(0x00, 1)        # ap_start
    _wait_ap_done(10.0, f"stage 1(config={cfg}) 超时")
    t_end = time.perf_counter()
    total_time += (t_end - t_start)
    print(f"[config={cfg}] compute time = {t_end - t_start:.4f} s")

    # ====== 5) Stage 2: write the result back to DDR (if the IP needs it) ======
    wr(0x34, 2)   # stage = 2 (transfer)
    wr(0x00, 1)   # ap_start
    _wait_ap_done(5.0, f"stage 2(config={cfg}) 超时")

    # Invalidate the CPU cache for `out`, then keep a private copy of this run's output.
    out.invalidate()
    out_vec = out.copy()
    results[cfg] = out_vec

    # ====== 6) Compare against the golden file (if one exists) ======
    gpath = golden_paths.get(cfg, None)
    if gpath and os.path.exists(gpath):
        golden = np.fromfile(gpath, dtype=np.uint16)
        if golden.shape != out_vec.shape:
            print(f"[config={cfg}] GOLDEN 形状不一致: golden={golden.shape}, out={out_vec.shape}")
        else:
            # Bit-exact comparison of the raw uint16 patterns is the strictest check for bf16.
            same = np.array_equal(golden, out_vec)
            diff = np.count_nonzero(golden != out_vec)  # number of differing elements
            # Numeric view of both sides, used to report the largest absolute error.
            g_f = bf16_to_f32(golden)
            o_f = bf16_to_f32(out_vec)
            abs_diff = np.abs(g_f - o_f)
            mask = np.isfinite(g_f) & np.isfinite(o_f)
            if mask.any():
                # Set entries with a non-finite operand to -inf so argmax returns the
                # index (in the original array) of the largest finite difference.
                finite_abs = abs_diff.copy()
                finite_abs[~mask] = -np.inf
                idx_max = int(np.argmax(finite_abs))
                max_abs_diff = float(abs_diff[idx_max])
            else:
                # No element is finite on both sides: nothing meaningful to report.
                idx_max = -1
                max_abs_diff = float('nan')
            print(f"[config={cfg}] compare golden: equal(bits)={same}, diff_count={diff}, max_abs_diff={max_abs_diff}")
            if idx_max >= 0:
                print(f"  worst@{idx_max}: golden={g_f[idx_max]}, out={o_f[idx_max]}, abs_diff={abs_diff[idx_max]}")
    else:
        print(f"[config={cfg}] 没有提供 golden 文件（跳过对比）")

print(f"Total compute time (stage1 sum) = {total_time:.4f} s")

# Peek at the first few elements of every stored result.
for cfg in results:
    print(f"config={cfg}, out[:8] =", results[cfg][:8])

in0 = 0x00000000_375C0000
in1 = 0x00000000_375F0000
out = 0x00000000_37A20000
stage-0 done; CTRL= 0x4
[config=0] compute time = 0.0022 s
[config=0] compare golden: equal(bits)=True, diff_count=0, max_abs_diff=0.0
  worst@0: golden=-2.015625, out=-2.015625, abs_diff=0.0
[config=1] compute time = 0.0011 s
[config=1] compare golden: equal(bits)=False, diff_count=32768, max_abs_diff=0.00012493133544921875
  worst@2909: golden=0.00012493133544921875, out=0.0, abs_diff=0.00012493133544921875
[config=2] compute time = 0.0012 s
[config=2] compare golden: equal(bits)=False, diff_count=32768, max_abs_diff=0.00012493133544921875
  worst@2909: golden=0.00012493133544921875, out=0.0, abs_diff=0.00012493133544921875
[config=3] compute time = 0.0011 s
[config=3] compare golden: equal(bits)=False, diff_count=16816, max_abs_diff=0.00390625
  worst@1: golden=0.85546875, out=0.8515625, abs_diff=0.00390625
[config=4] compute time = 0.0011 s
[config=4] compare golden: equal(bits)=False, diff_count=16222, m

In [9]:
from pynq import allocate
import numpy as np
import time, os

# ======================
# 0) Basic parameters and checks
# ======================
# NOTE: the overlay must already be loaded and the IP handle bound, e.g.:
#   from pynq import Overlay
#   overlay = Overlay('/root/jupyter_notebooks/aru_sample_wrapper2.bit', gen_cache=False)
#   acc_ip = overlay.activation_accelerator   # <- change to your IP instance name

# Config index handed to the accelerator for stage 1. (An earlier note here
# described config=1, a safe-softmax placeholder that outputs all zeros; the
# value actually run is the one assigned below.)
CONFIG_TO_RUN = 3
# Processing length hard-coded inside the HLS kernel.
N_FIXED = 32 * 1024
# Default per-stage timeout, in seconds.
TIMEOUT_S = 5.0

# ======================
# 1) Prepare the data
# ======================
in0_path, in1_path = (
    "/root/jupyter_notebooks/test_data/in0_bf16.bin",
    "/root/jupyter_notebooks/test_data/in1_bf16.bin",
)
# Reference output for the selected config. The alternative reference set is
# ".../test_data/golden_out_config_<cfg>_bf16.bin".
golden_path = f"/root/jupyter_notebooks/test_data/hls_output_config_{CONFIG_TO_RUN}.bin"

# Load the bf16 inputs unchanged, as raw uint16 bit patterns.
arr0, arr1 = (
    np.fromfile(src_path, dtype=np.uint16) for src_path in (in0_path, in1_path)
)

# 长度对齐到 N_FIXED（你的核内部就是按这个长度处理）
def align_len(a: np.ndarray, N=N_FIXED):
    """Return `a` fitted to exactly `N` elements.

    If `a` already has `N` elements it is returned as-is (same object). Otherwise
    a new zero-initialised array of length `N` and the same dtype is returned,
    holding the first min(N, a.size) elements of `a`, and a warning line is
    printed saying whether the data was truncated or zero-padded.
    """
    if a.size == N:
        return a
    fitted = np.zeros(N, dtype=a.dtype)
    kept = min(N, a.size)
    fitted[:kept] = a[:kept]
    # Reaching this point already implies a.size != N, so the warning is unconditional.
    print(f"[warn] 输入长度 {a.size} 与核固定长度 {N} 不同，已{'截断' if a.size>N else '零填充'}到 {N}")
    return fitted

# Fit both inputs to the kernel's fixed processing length.
arr0, arr1 = align_len(arr0, N_FIXED), align_len(arr1, N_FIXED)

# Allocate DMA-capable, physically contiguous buffers (the output matches in0's shape).
buf0, buf1, out = (
    allocate(shape=buf_shape, dtype=np.uint16)
    for buf_shape in (arr0.shape, arr1.shape, arr0.shape)
)

# Copy the inputs in and flush them (CPU -> DDR).
np.copyto(buf0, arr0)
buf0.flush()
np.copyto(buf1, arr1)
buf1.flush()

# Pre-fill the output with a sentinel pattern; it should be overwritten by the run,
# which makes a missing write-back easy to spot.
out[:] = 0xABCD
out.flush()

# Physical addresses that get programmed into the accelerator's registers.
pa0, pa1, pao = (int(dma_buf.physical_address) for dma_buf in (buf0, buf1, out))

# ======================
# 2) 写地址寄存器（与你 rpt 的偏移一致）
# ======================
def wr(off, val):
    """Write the low 32 bits of `val` (converted with int()) to `acc_ip` at offset `off`."""
    word = int(val) & 0xFFFFFFFF
    acc_ip.write(off, word)
def rd(off):
    """Return whatever `acc_ip.read` yields for register offset `off`."""
    value = acc_ip.read(off)
    return value

# Program the three 64-bit buffer addresses. Each one occupies a low word at the
# listed offset and a high word 4 bytes above it (in0: 0x10/0x14, in1: 0x1C/0x20,
# out: 0x28/0x2C).
for lo_off, addr in ((0x10, pa0), (0x1C, pa1), (0x28, pao)):
    wr(lo_off, addr & 0xFFFFFFFF)              # low 32 bits
    wr(lo_off + 4, (addr >> 32) & 0xFFFFFFFF)  # high 32 bits

# Read the registers back (high word first) to confirm what was programmed,
# then show the control register at offset 0x00.
in0_hi, in0_lo = rd(0x14), rd(0x10)
print("in0 = 0x%08X_%08X" % (in0_hi, in0_lo))
in1_hi, in1_lo = rd(0x20), rd(0x1C)
print("in1 = 0x%08X_%08X" % (in1_hi, in1_lo))
out_hi, out_lo = rd(0x2C), rd(0x28)
print("out = 0x%08X_%08X" % (out_hi, out_lo))
ctrl_word = rd(0x00)
print("CTRL =", hex(ctrl_word))

def run_stage(stage_val, config_val=None, timeout_s=TIMEOUT_S):
    """Select a stage (and optionally a config), start the core and wait for ap_done.

    Args:
        stage_val: Value written to the stage register at 0x34 (0, 1 or 2).
        config_val: If not None, written to the config register at 0x3C first.
        timeout_s: Maximum number of seconds to wait for ap_done.

    Raises:
        TimeoutError: If bit 1 (ap_done) of the register at 0x00 is not seen set
            within `timeout_s`; the message includes a fresh read of that register.
    """
    if config_val is not None:
        wr(0x3C, config_val)
    wr(0x34, stage_val)     # stage = 0/1/2
    # No explicit "clear done" step is performed; the core is started directly.
    wr(0x00, 1)             # ap_start = 1
    # Track the deadline on a monotonic clock. time.time() follows the wall clock,
    # which can be stepped and would then trigger the timeout early or never.
    deadline = time.monotonic() + timeout_s
    while (rd(0x00) & 0x2) == 0:  # wait for bit 1 = ap_done
        if time.monotonic() > deadline:
            raise TimeoutError(f"stage={stage_val} 超时，CTRL={hex(rd(0x00))}")
        time.sleep(0.001)
    print(f"stage-{stage_val} done; CTRL={hex(rd(0x00))}")

# ======================
# 3) Run the stages in order: 0 -> 1 -> 2
# ======================
# Stage 0: load in0/in1 from DDR into the kernel's internal static buffers.
run_stage(stage_val=0, config_val=0)

# Stage 1: compute with the config chosen in CONFIG_TO_RUN.
# The interval is taken with perf_counter(), a monotonic high-resolution clock,
# rather than the wall clock. It still includes run_stage's 1 ms polling sleep
# and its status print, so treat it as an upper bound on the kernel time.
t1 = time.perf_counter()
run_stage(stage_val=1, config_val=CONFIG_TO_RUN, timeout_s=10.0)
t2 = time.perf_counter()
print(f"[config={CONFIG_TO_RUN}] compute time (stage1) = {t2 - t1:.4f} s")

# Stage 2: write the kernel's internal result buffer (buf2) back to `out` in DDR.
run_stage(stage_val=2, config_val=0)

# ======================
# 4) Read the result back and compare with the golden file (if present)
# ======================
# Invalidate the CPU cache for `out` before reading, then take a private copy.
out.invalidate()
res = out.copy()
print("out[:16] =", res[:16])

if not os.path.exists(golden_path):
    print(f"[config={CONFIG_TO_RUN}] 未提供 golden（跳过对比）")
else:
    # Load the reference as raw uint16 and fit it to the kernel's fixed length
    # so it can be compared element by element with the result.
    golden = align_len(np.fromfile(golden_path, dtype=np.uint16), N_FIXED)
    mismatches = golden != res
    same = np.array_equal(golden, res)
    diff = np.count_nonzero(mismatches)  # number of differing elements
    print(f"[config={CONFIG_TO_RUN}] compare golden: equal={same}, diff_count={diff}")


in0 = 0x00000000_375C0000
in1 = 0x00000000_375F0000
out = 0x00000000_37A20000
CTRL = 0x4
stage-0 done; CTRL=0x4
stage-1 done; CTRL=0x4
[config=3] compute time (stage1) = 0.0016 s
stage-2 done; CTRL=0x4
out[:16] = [16020 16218 15903 16224 16003 16095 15933 16160 15879 16212 16209 16025
 16052 16191 15886 16125]
[config=3] compare golden: equal=True, diff_count=0
