# Weight, Scale 불러오기

In [318]:
import numpy as np
rel_path = './'

In [319]:
def clip(input, qbit):
    """Saturate `input` to the signed `qbit`-bit integer range.

    The bounds are ``-2**(qbit-1)`` and ``2**(qbit-1) - 1`` (as floats).
    The saturation itself is done by ``np.clip``, so scalars and arrays
    are both accepted.
    """
    half_range = 2. ** (qbit - 1)
    return np.clip(input, -half_range, half_range - 1.)

def uniform_quantize(input, qbit):
    """Symmetric uniform quantization of `input` to signed `qbit`-bit levels.

    The quantization step is chosen so that the symmetric range
    ``[-max|input|, +max|input|]`` spans ``2**qbit - 1`` steps. Each value is
    divided by that step, floored, and saturated to
    ``[-2**(qbit-1), 2**(qbit-1) - 1]`` (the same range `clip` uses).

    Args:
        input: array-like of real values (any shape).
        qbit: number of bits of the quantized representation.

    Returns:
        (output, scale): `output` is a float array of integer-valued levels
        with the shape of `input`; `scale` is the step size, so
        ``output * scale`` approximates `input`. For an empty or all-zero
        input the levels are all zero and `scale` is 1.0.
    """
    values = np.asarray(input, dtype=float)

    # Largest magnitude over the whole array (works for any number of dims).
    max_abs = np.max(np.abs(values)) if values.size else 0.0

    levels = 2. ** qbit - 1.
    scale = (max_abs - (-max_abs)) / levels

    # An all-zero (or empty) input would give scale == 0 and 0/0 = NaN below;
    # return exact zeros with a neutral step instead.
    if scale == 0:
        return np.zeros_like(values), 1.0

    half_range = 2. ** (qbit - 1)
    output = np.clip(np.floor(values / scale), -half_range, half_range - 1.)
    return output, scale

In [320]:
# Directory holding the exported weight / scale CSV files.
wdir = rel_path + 'tmpdata/'

# File locations: one weight file and one weight-scale file per FC layer.
fp_w0 = wdir + 'w0.fc1.weight.csv'
fp_w0_scale = wdir + 'w0.fc1.weight_scale.csv'
fp_w2 = wdir + 'w2.fc2.weight.csv'
fp_w2_scale = wdir + 'w2.fc2.weight_scale.csv'

# FC1 weights and their scale (used later as a multiplier when rescaling).
np_w0 = np.loadtxt(fp_w0, dtype=float, delimiter=',')
np_w0_scale = np.loadtxt(fp_w0_scale, dtype=float, delimiter=',')

# FC2 weights and their scale.
np_w2 = np.loadtxt(fp_w2, dtype=float, delimiter=',')
np_w2_scale = np.loadtxt(fp_w2_scale, dtype=float, delimiter=',')


# Python only

In [48]:
def fc1_fc2_inference(_image, np_w0, np_w0_scale, np_w2, np_w2_scale):
    """Software reference of the FC1 -> ReLU -> FC2 pipeline.

    Both layers run on 8-bit quantized inputs (see `uniform_quantize`); the
    integer dot-product result is rescaled with the input scale and the
    layer's weight scale.

    Args:
        _image: flattened input vector fed to FC1.
        np_w0: FC1 weight matrix; must support ``np_w0.dot(input)``.
        np_w0_scale: multiplier applied to the FC1 integer result.
        np_w2: FC2 weight matrix; must support ``np_w2.dot(hidden)``.
        np_w2_scale: multiplier applied to the FC2 integer result.

    Returns:
        (infer, fc2_fp): index of the largest FC2 output and the rescaled
        FC2 outputs themselves.
    """
    # 1) Quantize the input to 8-bit levels.
    image_q, image_scale = uniform_quantize(_image, qbit=8)

    # 2) FC1: integer MAC, then rescale to floating point.
    hidden = np_w0.dot(image_q) * image_scale * np_w0_scale

    # 3) ReLU.
    hidden_relu = np.maximum(hidden, 0.0)

    # 4) Re-quantize the activations for FC2.
    hidden_q, hidden_scale = uniform_quantize(hidden_relu, qbit=8)

    # 5) FC2: integer MAC, then rescale to floating point.
    logits = np_w2.dot(hidden_q) * hidden_scale * np_w2_scale

    # 6) Predicted class = position of the largest output (no softmax needed).
    return np.argmax(logits), logits

In [49]:
# Read the small MNIST test set; the loop below uses column 0 of each row
# separately from columns 1..784 (the input vector).
wdir = rel_path + 'tmpdata/'
fp_input_csv = wdir + 'mnist_test_small.csv'
np_input_csv = np.loadtxt(fp_input_csv, dtype=float, delimiter=',')

# Quick sanity check of what was loaded.
print(np_input_csv.shape)
print(np_input_csv)

(10, 785)
[[ 7.  0.  0. ...,  0.  0.  0.]
 [ 2.  0.  0. ...,  0.  0.  0.]
 [ 1.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 9.  0.  0. ...,  0.  0.  0.]
 [ 5.  0.  0. ...,  0.  0.  0.]
 [ 9.  0.  0. ...,  0.  0.  0.]]


In [50]:
# Give the weights their matrix shapes: FC1 is (128, 784), FC2 is (10, 128).
np_w0, np_w2 = np_w0.reshape(128, 784), np_w2.reshape(10, 128)

# Run the software reference on the first 10 rows of the test set.
for x in range(10):
    # Columns 1..784 are the input vector; bring the values down by 255.
    img = np_input_csv[x, 1:785].astype(float) / 255.0

    infer, logits = fc1_fc2_inference(img, np_w0, np_w0_scale, np_w2, np_w2_scale)

    print("inference =", infer)

inference = 7
inference = 2
inference = 1
inference = 0
inference = 4
inference = 1
inference = 4
inference = 9
inference = 6
inference = 9


# With PYNQ-Z2

In [321]:
# PYNQ-Z2 Initialize
from pynq import Overlay, Interrupt
# Load the overlay from the bitstream file and keep a handle to its `top_0`
# block; every register access below goes through `top.mmio`.
ol = Overlay("./design_top.bit")
top = ol.top_0

def dec_to_tc(data, bit=8):
    """Encode a signed number as an unsigned `bit`-wide two's-complement field.

    Args:
        data: signed value (int, or an integer-valued float / NumPy scalar).
        bit: width of the target field in bits.

    Returns:
        int in ``[0, 2**bit - 1]``. Values outside the signed `bit`-bit range
        wrap around instead of spilling into neighbouring instruction fields.
    """
    mask = (1 << bit) - 1
    # `// 1` floors float inputs before the integer conversion, which is what
    # the previous float arithmetic followed by int() effectively did.
    return int(data // 1) & mask
def tc_to_dec(data, bit=32):
    """Decode an unsigned `bit`-wide two's-complement word into a signed value.

    Words below ``2**(bit-1)`` are returned unchanged; larger words have
    ``2**bit`` subtracted and are returned as ``int``.
    """
    sign_threshold = 2 ** (bit - 1)
    if data < sign_threshold:
        return data
    return int(data - 2 * sign_threshold)
def instr_param(opvalid, opcode, param, data):
    """Assemble a parameter-type instruction word.

    Field placement: `opvalid` at bit 31, `opcode` starting at bit 28,
    `param` starting at bit 8 and `data` starting at bit 0. The fields are
    simply summed, with no range checking, so a `data` value of 256 or more
    carries into the `param` field.

    Returns:
        The instruction word as an ``int``.
    """
    word = (
        opvalid * 2 ** 31
        + opcode * 2 ** 28
        + param * 2 ** 8
        + data * 2 ** 0
    )
    return int(word)

def instr_data(opvalid, opcode, sel, addr, data):
    """Assemble a data-type instruction word.

    Field placement: `opvalid` at bit 31, `opcode` starting at bit 28, `sel`
    starting at bit 24 and `addr` starting at bit 8. `data` goes into the low
    bits in two's-complement form (via `dec_to_tc` with its default width).

    Returns:
        The instruction word as an ``int``.
    """
    word = (
        opvalid * 2 ** 31
        + opcode * 2 ** 28
        + sel * 2 ** 24
        + addr * 2 ** 8
        + dec_to_tc(data)  # signed payload, stored as two's complement
    )
    return int(word)
def pl_rst():
    """Pulse the instruction register: write all ones, then write zero.

    Both writes go to MMIO offset 0. Presumably this resets the programmable
    logic, as the name suggests (TODO confirm against the RTL).
    """
    all_ones = dec_to_tc(data=-1, bit=32)
    top.mmio.write(offset=0, data=all_ones)
    top.mmio.write(offset=0, data=0)
def finish_check():
    """Return bit 31 of the status word read from MMIO offset 4.

    Callers poll this until it becomes non-zero after issuing an execute
    instruction.
    """
    return (top.mmio.read(offset=4) >> 31) & 0x1
def valid_check():
    """Return bit 30 of the status word read from MMIO offset 4.

    The write-back helpers poll this until it becomes non-zero before they
    read the result register.
    """
    return (top.mmio.read(offset=4) >> 30) & 0x1

def wb_param_and_read(param):
    """Request a write-back of parameter `param` and return the value read.

    Issues an OPCODE_WBPARAM instruction, busy-waits (without timeout) until
    `valid_check()` is non-zero, then returns the raw word at MMIO offset 12.
    """
    request = instr_param(opvalid=1, opcode=OPCODE_WBPARAM, param=param, data=0)
    top.mmio.write(offset=0, data=request)
    while True:
        if valid_check():
            break
    return top.mmio.read(offset=12)
def wb_psram_and_read(sel, addr):
    """Request a write-back of one PSRAM entry and return it as a signed int.

    Issues an OPCODE_WBPSRAM instruction for (`sel`, `addr`), busy-waits
    (without timeout) until `valid_check()` is non-zero, then decodes the
    word at MMIO offset 12 as a 32-bit two's-complement number.
    """
    request = instr_data(opvalid=1, opcode=OPCODE_WBPSRAM, sel=sel, addr=addr, data=0)
    top.mmio.write(offset=0, data=request)
    while True:
        if valid_check():
            break
    raw_word = top.mmio.read(offset=12)
    return tc_to_dec(raw_word, bit=32)

def weight_load(sel,addr,data):
    """Issue one OPCODE_LDSRAM write once status bit 0 is clear.

    Polls the status word at MMIO offset 4 until its lowest bit reads 0
    (presumably a "pending" flag - TODO confirm), then writes the load
    instruction for (`sel`, `addr`, `data`) to MMIO offset 0.
    """
    while top.mmio.read(4) & 0x1:
        pass
    load_word = instr_data(opvalid=1, opcode=OPCODE_LDSRAM, sel=sel, addr=addr, data=data)
    top.mmio.write(offset=0, data=load_word)

# Instruction-set constants.
# Opcodes: placed in the instruction word starting at bit 28 by
# instr_param() / instr_data().
OPCODE_NOP = 0
OPCODE_PARAM = 1
OPCODE_LDSRAM = 2
OPCODE_STSRAM = 3
OPCODE_EX = 4
OPCODE_WBPSRAM = 5
OPCODE_WBPARAM = 6

# Parameter selectors used with OPCODE_PARAM / OPCODE_WBPARAM.
PARAM_BASE_WSRAM = 0
PARAM_S = 1
PARAM_OC = 2
PARAM_IC = 3
PARAM_TRG = 4

# Companion selectors for PARAM_IC and PARAM_BASE_WSRAM: callers in this file
# write `value % 256` to PARAM_IC and `value / 256` to PARAM_IC_WH.
PARAM_IC_WH = 5
PARAM_BASE_WSRAM_WH = 6

# Values written to PARAM_TRG before a run of OPCODE_LDSRAM instructions
# (TRG_WSRAM before weight loads, TRG_ISRAM before input loads).
TRG_ISRAM = 0
TRG_WSRAM = 1
TRG_PSRAM = 2

# Pulse the instruction register once before any other command is issued.
pl_rst()

In [322]:
# Load the FC1 weights (np_w0) into the weight SRAM.
# FC1 starts at WSRAM base 0; the base is written as a low part
# (PARAM_BASE_WSRAM) and a high part (PARAM_BASE_WSRAM_WH).
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_BASE_WSRAM, 0))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_BASE_WSRAM_WH, 0))
print(wb_param_and_read(PARAM_BASE_WSRAM))

top.mmio.write(offset=0, data=instr_param(opvalid=1, opcode=OPCODE_PARAM, param=PARAM_TRG, data=TRG_WSRAM))

# Index through a flat view so this cell works both when np_w0 is still the
# array read from CSV and after it has been reshaped to (128, 784) by the
# software-reference cell. Assumes np_w0 holds 128 * 784 values.
w0_flat = np.asarray(np_w0).reshape(-1)
for i in range(128):
    for j in range(784):
        x = i % 4                   # sel field: output channel modulo 4
        y = (i // 4) * 784 + j      # addr field: 784 entries per group of 4 channels
        top.mmio.write(offset=0, data=instr_data(opvalid=1, opcode=OPCODE_LDSRAM, sel=x, addr=y, data=w0_flat[i*784+j]))

0


In [323]:
# Load the FC2 weights (np_w2) into the weight SRAM.
# The FC2 base comes right after the (128 / 4) * 784 addresses used for FC1.
fc2_wsram_base = (128 // 4) * 784

# instr_param() places `data` at bit 0 and `param` at bit 8, so only the low
# 8 bits of the base may go into PARAM_BASE_WSRAM; passing the full value
# would carry into the param field. The remaining bits go to
# PARAM_BASE_WSRAM_WH, the same split the PARAM_IC / PARAM_IC_WH writes use.
top.mmio.write(0, instr_param(1, OPCODE_PARAM,
                             PARAM_BASE_WSRAM, fc2_wsram_base % 256))
top.mmio.write(0, instr_param(1, OPCODE_PARAM,
                             PARAM_BASE_WSRAM_WH, fc2_wsram_base // 256))
print(wb_param_and_read(PARAM_BASE_WSRAM))

top.mmio.write(offset=0, data=instr_param(opvalid=1, opcode=OPCODE_PARAM, param=PARAM_TRG, data=TRG_WSRAM))

# Index through a flat view so this cell works both when np_w2 is still the
# array read from CSV and after it has been reshaped to (10, 128).
# Assumes np_w2 holds 10 * 128 values.
w2_flat = np.asarray(np_w2).reshape(-1)
for i in range(10):
    for j in range(128):
        x = i % 4                   # sel field: output channel modulo 4
        y = (i // 4) * 128 + j      # addr field, relative to the base set above
        top.mmio.write(offset=0, data=instr_data(opvalid=1, opcode=OPCODE_LDSRAM, sel=x, addr=y, data=w2_flat[i*128+j]))

25088


In [324]:
# Parameter round-trip test: write S, OC and IC, then read them back.
# IC = 784 does not fit the 8-bit data field, so it is written as a low part
# (PARAM_IC) and a high part (PARAM_IC_WH).
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_S, 1))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_OC, 128))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC, 784 % 256))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC_WH, 784 // 256))

for readback_param in (PARAM_S, PARAM_OC, PARAM_IC):
    print(wb_param_and_read(readback_param))

1
128
784


In [325]:
# Reload the small MNIST test set used by the hardware inference loop.
wdir = rel_path + 'tmpdata/'
fp_input_csv = wdir + 'mnist_test_small.csv'
np_input_csv = np.loadtxt(fp_input_csv, delimiter=',', dtype=float)

print(np_input_csv.shape)

# Show the current fclk0 frequency in MHz (displayed as the cell result).
from pynq import Clocks
Clocks.fclk0_mhz

(10, 785)


15.625

In [326]:
testsize = 10

# WSRAM base of the FC2 weights: right after the (128 / 4) * 784 addresses
# used for FC1.
fc2_wsram_base = (128 // 4) * 784

for b in range(testsize):

    # ----------------------------------------------
    # 0. Label and input vector (flattened MNIST, 784 values)
    # ----------------------------------------------
    _label = int(np_input_csv[b][0])
    _image = np_input_csv[b][1:785]


    # ================================================================
    # 1. FC1 layer (input 784 -> hidden 128)
    # ================================================================

    # -------- FC1: parameters --------
    # IC = 784 is split into a low part (PARAM_IC) and a high part (PARAM_IC_WH).
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_S, 1))
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC, 784 % 256))
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC_WH, 784 // 256))
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_OC, 128))

    # -------- FC1: input quantization --------
    np_ia0_q, scale_ia0 = uniform_quantize(_image, qbit=8)

    # -------- FC1: load the input into ISRAM --------
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_TRG, TRG_ISRAM))

    for i in range(784):
        x = i // 8                   # word index (addr field)
        y = i % 8                    # byte index (sel field)
        top.mmio.write(0, instr_data(1, OPCODE_LDSRAM, y, x, np_ia0_q[i]))

    # -------- FC1: WSRAM base, then execute --------
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_BASE_WSRAM, 0))
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_BASE_WSRAM_WH, 0))
    top.mmio.write(0, instr_param(1, OPCODE_EX, 0, 0))
    top.mmio.write(0, instr_param(1, OPCODE_NOP, 0, 0))

    np_oa0 = np.zeros(128, dtype=float)

    # -------- FC1: wait for completion --------
    while not finish_check():
        pass

    # -------- FC1: read the results back from PSRAM --------
    # Output idx is read with sel = idx % 4 and addr = idx // 4.
    for i in range((128 + 3) // 4):
        for j in range(4):
            idx = j + 4 * i
            if idx < 128:
                out_val = wb_psram_and_read(sel=j, addr=i)
                # Rescale the integer accumulator to floating point.
                np_oa0[idx] = out_val * scale_ia0 * np_w0_scale

    top.mmio.write(0, instr_param(1, OPCODE_NOP, 0, 0))

    # ================================================================
    # 2. FC2 layer (hidden 128 -> output 10)
    # ================================================================

    # -------- FC2: parameters --------
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_S, 1))
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC, 128))
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC_WH, 0))  # 128 < 256
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_OC, 10))

    # -------- FC2: activation (ReLU) --------
    np_ia2 = np.maximum(np_oa0, 0)

    # -------- FC2: input quantization --------
    np_ia2_q, scale_ia2 = uniform_quantize(np_ia2, qbit=8)

    # -------- FC2: load the input into ISRAM --------
    top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_TRG, TRG_ISRAM))

    for i in range(128):
        x = i // 8
        y = i % 8
        top.mmio.write(0, instr_data(1, OPCODE_LDSRAM, y, x, np_ia2_q[i]))

    # -------- FC2: weight base --------
    # Only the low 8 bits fit the data field of instr_param(); passing the
    # full base would carry into the param field. The rest goes to the _WH
    # parameter, like the PARAM_IC / PARAM_IC_WH split above.
    top.mmio.write(0, instr_param(1, OPCODE_PARAM,
                                 PARAM_BASE_WSRAM, fc2_wsram_base % 256))
    top.mmio.write(0, instr_param(1, OPCODE_PARAM,
                                 PARAM_BASE_WSRAM_WH, fc2_wsram_base // 256))

    # -------- FC2: execute --------
    top.mmio.write(0, instr_param(1, OPCODE_EX, 0, 0))
    top.mmio.write(0, instr_param(1, OPCODE_NOP, 0, 0))

    np_oa2 = np.zeros(10, dtype=float)

    # -------- FC2: wait for completion --------
    while not finish_check():
        pass

    # -------- FC2: read the results back --------
    for i in range((10 + 3) // 4):
        for j in range(4):
            idx = j + 4 * i
            if idx < 10:
                out_val = wb_psram_and_read(sel=j, addr=i)
                np_oa2[idx] = out_val * scale_ia2 * np_w2_scale

    top.mmio.write(0, instr_param(1, OPCODE_NOP, 0, 0))


    # ================================================================
    # 3. Report the prediction
    # ================================================================
    _infer = np.argmax(np_oa2)
    print(f"inference: {_infer}, answer: {_label}")


inference: 7, answer: 7
inference: 2, answer: 2
inference: 1, answer: 1
inference: 0, answer: 0
inference: 4, answer: 4
inference: 1, answer: 1
inference: 4, answer: 4
inference: 9, answer: 9
inference: 6, answer: 5
inference: 9, answer: 9


## 단일 레이어 동작 실험

In [283]:
# Single-layer experiment: fill the FC1 weight region with a constant.
# Every weight is set to 2 instead of the real np_w0 values, so with all-ones
# inputs each output should be 784 * 2 = 1568.
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_BASE_WSRAM, 0))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_BASE_WSRAM_WH, 0))
print(wb_param_and_read(PARAM_BASE_WSRAM))

top.mmio.write(offset=0, data=instr_param(opvalid=1, opcode=OPCODE_PARAM, param=PARAM_TRG, data=TRG_WSRAM))

# `baseaddr` was not defined anywhere in this notebook. FC1 uses WSRAM base 0
# (see the PARAM_BASE_WSRAM writes above), so 0 is used as the address offset.
# NOTE(review): confirm no other offset was intended.
baseaddr = 0
for i in range(128):
    for j in range(784):
        x = i % 4
        y = baseaddr + (i // 4) * 784 + j
        top.mmio.write(offset=0, data=instr_data(opvalid=1, opcode=OPCODE_LDSRAM, sel=x, addr=y, data=2))

0


In [284]:
# ================================================================
# Single-layer experiment: FC1 (input 784 -> hidden 128)
# ================================================================

# -------- FC1: parameters --------
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_S, 1))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC, 784 % 256))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC_WH, 784 // 256))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_OC, 128))

# -------- FC1: input --------
# No quantization here: the experiment feeds a constant 1 for every input
# instead of a quantized image.

# -------- FC1: load the input into ISRAM --------
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_TRG, TRG_ISRAM))

for i in range(784):
    x = i // 8                   # word index (addr field)
    y = i % 8                    # byte index (sel field)
    top.mmio.write(0, instr_data(1, OPCODE_LDSRAM, y, x, 1))

# -------- FC1: WSRAM base, then execute --------
# Write both parts of the base, as the full inference loop does; otherwise a
# high part left over from an earlier FC2 run would still be in effect.
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_BASE_WSRAM, 0))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_BASE_WSRAM_WH, 0))
top.mmio.write(0, instr_param(1, OPCODE_EX, 0, 0))
top.mmio.write(0, instr_param(1, OPCODE_NOP, 0, 0))

np_oa0 = np.zeros(128, dtype=float)

# -------- FC1: wait for completion --------
while not finish_check():
    pass

# -------- FC1: read the raw results back from PSRAM --------
# The integer accumulators are stored as-is (no rescaling) so they can be
# compared against the expected constant.
for i in range((128 + 3) // 4):
    for j in range(4):
        idx = j + 4 * i
        if idx < 128:
            out_val = wb_psram_and_read(sel=j, addr=i)
            np_oa0[idx] = out_val

top.mmio.write(0, instr_param(1, OPCODE_NOP, 0, 0))
print(np_oa0)

[ 1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.
  1568.  1568.  1568.  1568.  1568.  1568.  1568.  1568.]


In [279]:
# Single-layer experiment: fill the FC2 weight region with a constant.
# Every weight is set to 1 instead of the real np_w2 values, so with all-ones
# inputs each output should be 128.
# The FC2 base comes right after the (128 / 4) * 784 addresses used for FC1.
fc2_wsram_base = (128 // 4) * 784

# Only the low 8 bits fit the data field of instr_param(); passing the full
# base would carry into the param field. The rest goes to the _WH parameter.
top.mmio.write(0, instr_param(1, OPCODE_PARAM,
                             PARAM_BASE_WSRAM, fc2_wsram_base % 256))
top.mmio.write(0, instr_param(1, OPCODE_PARAM,
                             PARAM_BASE_WSRAM_WH, fc2_wsram_base // 256))
print(wb_param_and_read(PARAM_BASE_WSRAM))

top.mmio.write(offset=0, data=instr_param(opvalid=1, opcode=OPCODE_PARAM, param=PARAM_TRG, data=TRG_WSRAM))
for i in range(10):
    for j in range(128):
        x = i % 4
        y = (i // 4) * 128 + j
        top.mmio.write(offset=0, data=instr_data(opvalid=1, opcode=OPCODE_LDSRAM, sel=x, addr=y, data=1))

25088


In [280]:
# ================================================================
# Single-layer experiment: FC2 (hidden 128 -> output 10)
# ================================================================

# -------- FC2: parameters --------
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_S, 1))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC, 128))
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_IC_WH, 0))  # 128 < 256
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_OC, 10))

# -------- FC2: input --------
# No ReLU / quantization here: the experiment feeds a constant 1 for every
# input instead of the FC1 activations.

# -------- FC2: load the input into ISRAM --------
top.mmio.write(0, instr_param(1, OPCODE_PARAM, PARAM_TRG, TRG_ISRAM))

for i in range(128):
    x = i // 8
    y = i % 8
    top.mmio.write(0, instr_data(1, OPCODE_LDSRAM, y, x, 1))

# -------- FC2: weight base --------
# The FC2 base comes right after the (128 / 4) * 784 addresses used for FC1.
# Only its low 8 bits fit the data field of instr_param(); passing the full
# value would carry into the param field. The rest goes to the _WH parameter.
fc2_wsram_base = (128 // 4) * 784
top.mmio.write(0, instr_param(1, OPCODE_PARAM,
                             PARAM_BASE_WSRAM, fc2_wsram_base % 256))
top.mmio.write(0, instr_param(1, OPCODE_PARAM,
                             PARAM_BASE_WSRAM_WH, fc2_wsram_base // 256))
print(wb_param_and_read(PARAM_BASE_WSRAM))

# -------- FC2: execute --------
top.mmio.write(0, instr_param(1, OPCODE_EX, 0, 0))
top.mmio.write(0, instr_param(1, OPCODE_NOP, 0, 0))

np_oa2 = np.zeros(10, dtype=float)

# -------- FC2: wait for completion --------
while not finish_check():
    pass

# -------- FC2: read the raw results back --------
# The integer accumulators are stored as-is (no rescaling).
for i in range((10 + 3) // 4):
    for j in range(4):
        idx = j + 4 * i
        if idx < 10:
            out_val = wb_psram_and_read(sel=j, addr=i)
            np_oa2[idx] = out_val

print(np_oa2)
top.mmio.write(0, instr_param(1, OPCODE_NOP, 0, 0))


25088
[ 128.  128.  128.  128.  128.  128.  128.  128.  128.  128.]
