In [1]:
from extern_funcs_cython import interpolate, ln, lstm_state_calculator
from fusion import fusion_quantize_cython, prep_cython
from keyframe_buffer import KeyframeBuffer
from pynq import Overlay, allocate
import numpy as np
import math
import time
import nngen_ctrl as ng

In [2]:
# Load the flattened network parameters.  np.load on an .npz archive returns a
# lazily-read NpzFile; the context manager closes the underlying file once
# the array has been read into memory.
with np.load("./flattened_params/flattened_params.npz") as params_archive:
    params = params_archive['arr_0']

In [3]:
# Output buffers in buffer-index order, each paired with its aligned shape.
_output_specs = [
    ('feature_half', (1, 32, 48, 32)),
    ('cell_state', (1, 2, 3, 512)),
    ('hidden_state', (1, 2, 3, 512)),
    ('depth_org', (1, 64, 96, 8)),
]
output_files = [_name for _name, _shape in _output_specs]
output_aligned_shapes = [_shape for _name, _shape in _output_specs]
cell_state_idx = output_files.index('cell_state')

# Input buffers, again in buffer-index order, with their aligned shapes.
_input_specs = [
    ('reference_image', (1, 64, 96, 8)),
    ('hidden_state', (1, 2, 3, 512)),
    ('cell_state', (1, 2, 3, 512)),
]
input_files = [_name for _name, _shape in _input_specs]
input_aligned_shapes = [_shape for _name, _shape in _input_specs]
# Input indices are counted after all of the outputs.
hidden_state_idx = len(output_files) + input_files.index('hidden_state')

In [4]:
chunk_size = 64  # alignment granularity (in bytes) of every buffer region


def get_end_addr(addr, memory_size):
    """Return the end address of a region, rounded up to a chunk boundary.

    Args:
        addr: start address of the region, in bytes.
        memory_size: size of the region, in bytes.

    Returns:
        The smallest multiple of ``chunk_size`` that is greater than or equal
        to ``addr + memory_size``, as an ``int``.
    """
    end = addr + memory_size
    # Ceiling division done in integer arithmetic: going through float
    # division silently loses precision once addresses exceed 2**53.
    return int(-(-end // chunk_size)) * chunk_size

def shape2size(shape):
    """Return the number of elements described by ``shape``.

    This is the product of all dimensions; an empty shape yields 1.
    """
    total = 1
    for dim in shape:
        total = total * dim
    return total

In [5]:
# Word-size parameters.  num_align_words is the number of act_bit-wide values
# that fit in one axi_datawidth-bit word (it is not referenced again in the
# visible part of this notebook).
axi_datawidth = 128
act_bit = 16
num_align_words = axi_datawidth // act_bit

# Lay the output buffers out back to back starting at output_offset.
# addrs[i] is the start address (in bytes) of buffer i; each region's end is
# rounded up to a chunk boundary by get_end_addr.
output_offset = 0
addrs = [output_offset]
for output_aligned_shape in output_aligned_shapes:
    addrs.append(get_end_addr(addrs[-1], shape2size(output_aligned_shape) * (act_bit // 8)))

# The input buffers follow directly after the last output buffer.
input_offset = addrs[-1]
for input_aligned_shape in input_aligned_shapes:
    addrs.append(get_end_addr(addrs[-1], shape2size(input_aligned_shape) * (act_bit // 8)))
# addrs[-2] is the start of the last input buffer (the input cell_state), so
# the output cell_state is redirected to the same location as the input one.
cell_state_offset = addrs[-2] # point the output cell_state at the input cell_state address
addrs[cell_state_idx] = cell_state_offset
# Fixed address for the input hidden_state; the same value is used as the
# output address of extern opcode 0x102 in the externs table.
# NOTE(review): presumably taken from the generated design's memory map -- confirm.
hidden_state_offset = 176869824
addrs[hidden_state_idx] = hidden_state_offset
# The last entry is the end of the final input buffer; the network
# parameters are copied into the buffer starting here.
param_offset = addrs[-1]
print(output_offset, input_offset, param_offset)
print(addrs)

0 208896 319488
[0, 313344, 104448, 110592, 208896, 176869824, 313344, 319488]


In [6]:
# Bitstream file and the name of the accelerator IP inside the overlay.
bitfile = 'design_1.bit'
ipname = 'dvmvs_0'

# Create the PYNQ overlay from the bitstream and wrap the named IP with the
# nngen_ctrl driver object used for all later hardware calls in this notebook.
overlay = Overlay(bitfile)
ip = ng.nngen_core(overlay, ipname)

In [7]:
# Allocate one 192 MiB byte buffer that holds every output, input, parameter
# and intermediate region addressed elsewhere in this notebook.
memory_size = 1024 * 1024 * 192
buf = allocate(shape=(memory_size,), dtype=np.uint8)
# Copy the parameters into the buffer starting at param_offset.
# NOTE(review): the slice length uses params.size, so this assumes params has
# a 1-byte dtype (one element per byte) -- confirm against the .npz contents.
buf[param_offset:param_offset + params.size] = params.view(np.int8)

In [8]:
# Hand the shared buffer to the IP driver, then redirect the output
# cell_state buffer (index cell_state_idx) to cell_state_offset.
ip.set_global_buffer(buf)
ip.write_buffer_address(cell_state_idx, cell_state_offset)
# Print the address of each of the 7 buffers (4 outputs + 3 inputs) as read
# back from the IP.  Only the cell_state address is written above; the
# hidden_state input address is not overridden here.
for i in range(7):
    print(ip.read_buffer_address(i))

0
313344
104448
110592
208896
307200
313344


In [9]:
def prepare_input_value(value, lshift):
    """Quantize ``value`` to int16 fixed point with ``lshift`` fractional bits.

    Args:
        value: scalar or array of real values.
        lshift: number of fractional bits; the input is scaled by ``2 ** lshift``.

    Returns:
        The scaled value, saturated to the int16 range, rounded to the
        nearest integer and cast to ``np.int16``.
    """
    int16_info = np.iinfo(np.int16)
    # Scale in float64 so integer inputs cannot overflow their own dtype;
    # multiplying by a power of two is exact for floating-point inputs.
    scaled = np.asarray(value, dtype=np.float64) * (1 << lshift)
    # Saturate to the real int16 range [-32768, 32767].  Bounds one step
    # wider than that would wrap around to the opposite sign on the cast.
    saturated = np.clip(scaled, int16_info.min, int16_info.max)
    return np.round(saturated.astype(np.float64)).astype(np.int16)

In [10]:
# Maximum number of measurement frames requested from the keyframe buffer
# for each reference frame.
max_n_measurement_frames = 2

# Depth range used when converting the network output back to metric depth.
min_depth = 0.25
max_depth = 20.0
inverse_depth_base = 1 / max_depth
inverse_depth_multiplier = 1 / min_depth - inverse_depth_base

# Settings passed to KeyframeBuffer.
test_keyframe_buffer_size = 30
test_keyframe_pose_distance = 0.1
test_optimal_t_measure = 0.15
test_optimal_R_measure = 0.0

# All-zero recurrent states, written to the buffer when no previous depth is
# available.
org_hidden_state = np.zeros(3072, dtype=np.int16)
org_cell_state = np.zeros(3072, dtype=np.int16)

# Zero channels appended to the reference image along its last axis.
reference_pads = np.zeros((1, 64, 96, 5), dtype=np.int16)

In [11]:
def get_warp_grid_for_cost_volume_calculation(width, height):
    """Build a ``(3, height * width)`` float32 grid of homogeneous pixel coordinates.

    Row 0 holds the x (column) index of each pixel, row 1 the y (row) index
    and row 2 is all ones.  Pixels are enumerated row by row.
    """
    col_coords = np.linspace(0, width - 1, num=int(width))
    row_coords = np.linspace(0, height - 1, num=int(height))
    homogeneous = np.ones(shape=(height, width))
    grid_x, grid_y = np.meshgrid(col_coords, row_coords)
    per_pixel = np.stack((grid_x, grid_y, homogeneous), axis=-1).astype(np.float32)
    # One (x, y, 1) triple per pixel, transposed so each coordinate is a row.
    return per_pixel.reshape(-1, 3).T


# Grid for the half-resolution (48 x 32) feature map.
warp_grid = get_warp_grid_for_cost_volume_calculation(96 // 2, 64 // 2)

def round_and_clip(input):
    """Round ``input`` to the nearest integer and saturate it to int16.

    Values are rounded, widened to int64, clamped to the int16 range and
    finally cast to ``np.int16``.
    """
    limits = np.iinfo(np.int16)
    rounded = np.round(input).astype(np.int64)
    saturated = np.clip(rounded, limits.min, limits.max)
    return saturated.astype(np.int16)

In [12]:
# Table of operations that are executed in software while the IP is paused.
# Addresses are byte offsets into buf and are consumed by run_extern.
# Entries whose func is None are special-cased there: 0x79 goes through
# fusion_quantize_cython when warped images are supplied, and 0x102 is
# skipped entirely.
# opcode -> (func, input.addr, input.aligned_shape, output.addr, output.aligned_shape)
externs = {0x79: (None, 0, (1, 32, 48, 32), 175680960, (1, 32, 48, 64)),
           0x102: (None, 307200, (1, 32, 48, 512), hidden_state_offset, (1, 32, 48, 512)),
           0x104: (ln(12), 176912832, (1, 2, 3, 512), 176931264, (1, 2, 3, 512)),
           0x105: (ln(12), 176937408, (1, 2, 3, 512), cell_state_offset, (1, 2, 3, 512)),
           0x107: (interpolate(4, 6, 0, 'bilinear'), 104448, (1, 2, 3, 512), 176949696, (1, 4, 6, 512)),
           0x113: (interpolate(8, 12, 0, 'bilinear'), 177035712, (1, 4, 6, 256), 177048000, (1, 8, 12, 256)),
           0x115: (interpolate(8, 12, 0, 'bilinear'), 177097152, (1, 4, 6, 8), 177097536, (1, 8, 12, 8)),
           0x120: (interpolate(16, 24, 0, 'bilinear'), 177200448, (1, 8, 12, 128), 177225024, (1, 16, 24, 128)),
           0x122: (interpolate(16, 24, 0, 'bilinear'), 177323328, (1, 8, 12, 8), 177324864, (1, 16, 24, 8)),
           0x127: (interpolate(32, 48, 0, 'bilinear'), 177539904, (1, 16, 24, 64), 177589056, (1, 32, 48, 64)),
           0x129: (interpolate(32, 48, 0, 'bilinear'), 177785664, (1, 16, 24, 8), 177791808, (1, 32, 48, 8)),
           0x134: (interpolate(64, 96, 0, 'bilinear'), 178357056, (1, 32, 48, 8), 178774848, (1, 64, 96, 8)),
           0x135: (interpolate(64, 96, 0, 'bilinear'), 178258752, (1, 32, 48, 32), 178381632, (1, 64, 96, 32))}

In [13]:
def run_extern(code, warped_image2s=None):
    """Execute the software side of one extern operation on the shared buffer.

    The operation's input is read from ``buf`` at the address listed in
    ``externs``, processed, and the int16 result is written back to ``buf``
    at the operation's output address.

    Args:
        code: extern opcode, a key of ``externs``.
        warped_image2s: when given, the input is passed together with it to
            ``fusion_quantize_cython`` instead of the table's own function.

    Returns:
        For opcode 0x79, the input array that was read (a view into ``buf``);
        ``None`` for every other opcode.
    """
    if code == 0x102:
        # Nothing is computed in software for this opcode.
        return None

    func, input_addr, input_aligned_shape, output_addr, output_aligned_shape = externs[code]
    bytes_per_value = act_bit // 8
    input_nbytes = shape2size(input_aligned_shape) * bytes_per_value
    extern_input = buf[input_addr:input_addr + input_nbytes].view(np.int16).reshape(input_aligned_shape)
    if extern_input.shape[-1] == 8:
        # An 8-wide last axis is alignment padding; only channel 0 is used.
        extern_input = extern_input[:, :, :, :1]
    if warped_image2s is None:
        output = func(extern_input)
    else:
        output = np.array(fusion_quantize_cython(np.array(extern_input), warped_image2s), dtype=np.int16)
    if output.shape != output_aligned_shape:
        # Zero-pad the last axis up to the aligned width.  The width is taken
        # from the table rather than being fixed at 7 extra channels.
        pad_channels = output_aligned_shape[-1] - output.shape[-1]
        padding = np.zeros((*output.shape[:-1], pad_channels), dtype=output.dtype)
        output = np.append(output, padding, axis=output.ndim - 1)
    output = output.astype(np.int16).reshape(-1)
    output_nbytes = shape2size(output_aligned_shape) * bytes_per_value
    buf[output_addr:output_addr + output_nbytes] = output.view(np.uint8)

    if code == 0x79:
        return extern_input
    return None

In [21]:
test_dataset_names = ["chess-seq-01", "chess-seq-02", "fire-seq-01", "fire-seq-02", "office-seq-01", "office-seq-03", "redkitchen-seq-01", "redkitchen-seq-07"]

# Run every test sequence through the accelerator.  Per-frame and
# per-sequence wall-clock times are printed and logged to time_fadec.txt, and
# the predicted depth maps of each sequence are saved under depths/.
# The log is opened with a context manager so it is flushed and closed even if
# a step raises or the run is interrupted.
with open('time_fadec.txt', 'w') as f:
    for test_dataset_name in test_dataset_names:
        print("Predicting %s" % test_dataset_name)
        f.write("Predicting %s\n" % test_dataset_name)
        # NOTE(review): data_npz is handed to lstm_state_calculator below, which
        # may keep reading from it, so the archive is deliberately left open.
        data_npz = np.load("data_7scenes/%s.npz" % test_dataset_name)
        reference_images = data_npz["reference_image"]
        reference_poses = data_npz["reference_pose"]
        calc = lstm_state_calculator(data_npz, prepare_input_value, 14-1)
        half_K = data_npz["half_K"][0]
        inv_half_K = np.linalg.inv(half_K)

        keyframe_buffer = KeyframeBuffer(buffer_size=test_keyframe_buffer_size,
                                         keyframe_pose_distance=test_keyframe_pose_distance,
                                         optimal_t_score=test_optimal_t_measure,
                                         optimal_R_score=test_optimal_R_measure,
                                         store_return_indices=False)
        # Recurrent state carried from one processed frame to the next.
        hidden_state = None
        previous_depth = None
        previous_pose = None

        start_time_total = time.time()

        idx = 0
        depths = []
        for n in range(len(reference_images)):
            start_time = time.time()
            response = keyframe_buffer.try_new_keyframe(reference_poses[n][0])

            # NOTE(review): the printed file number is n + 3; the reason for the
            # offset is not visible in this notebook.
            print("evaluating %05d.png (response: %d) ..." % (n + 3, response))
            f.write("evaluating %05d.png (response: %d) ...\n" % (n + 3, response))
            if response in (2, 4, 5):
                # These responses skip the frame without touching any state.
                continue
            elif response == 3:
                # This response skips the frame and also resets the recurrent state.
                hidden_state = None
                previous_depth = None
                previous_pose = None
                continue

            # Quantize the image (NCHW -> NHWC, 12 fractional bits), pad the
            # channel axis with reference_pads and write it to the first input
            # buffer.
            reference_image_value = prepare_input_value(reference_images[n].transpose(0, 2, 3, 1), 12)
            reference_image_value = np.append(reference_image_value, reference_pads, axis=3).reshape(-1)
            addr = addrs[len(output_files)]
            buf[addr:addr + reference_image_value.size * (act_bit // 8)] = reference_image_value.view(np.uint8)

            ip.run()

            if response == 0:
                # No depth is produced for this response: every extern is
                # resumed without running its software step, and the features
                # read at the first extern's input address are stored as a new
                # keyframe.
                for i in range(len(externs)):
                    code = ip.wait_extern()
                    ip.resume_extern()
                    if i == 0:
                        _, input_addr, input_aligned_shape, _, _ = externs[code]
                        feature_half_value = buf[input_addr:input_addr + shape2size(input_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(input_aligned_shape)
                        keyframe_buffer.add_new_keyframe(reference_poses[n][0], feature_half_value.copy())
                ip.wait()
                # Measure once so the console and the log record the same value.
                elapsed = time.time() - start_time
                print(elapsed)
                f.write("%.17f\n" % elapsed)
                continue

            ### prepare fusion ###
            measurement_poses_value = []
            measurement_features_value = []
            frame_number_value = idx
            measurement_frames = keyframe_buffer.get_best_measurement_frames(reference_poses[n][0], max_n_measurement_frames)
            for measurement_frame in measurement_frames:
                measurement_poses_value.append(measurement_frame[0])
                measurement_features_value.append(measurement_frame[1])

            inv_pose2s = np.linalg.inv(np.array(measurement_poses_value))
            warped_image2s = np.array(prep_cython(len(measurement_frames), np.array(measurement_features_value), half_K, inv_half_K, reference_poses[n][0], inv_pose2s, warp_grid), dtype=np.int16)

            # Service the externs in the order the IP raises them.  The order of
            # the calls inside each branch is part of the hardware handshake.
            for i in range(len(externs)):
                code = ip.wait_extern()
                if i == 0:
                    # First extern: run the fusion step with the warped
                    # measurement features, then resume the IP.
                    feature_half_value = run_extern(code, warped_image2s)
                    ip.resume_extern()

                    ### prepare hidden_state and cell_state ###
                    if previous_depth is not None:
                        hidden_state_value = calc(hidden_state, previous_depth, previous_pose, reference_poses[n]).reshape(-1)
                        addr = addrs[hidden_state_idx]
                        buf[addr:addr + hidden_state_value.size * (act_bit // 8)] = hidden_state_value.view(np.uint8)
                    else:
                        # No previous depth: start from all-zero cell and hidden states.
                        addr = addrs[hidden_state_idx+1]
                        buf[addr:addr + org_cell_state.size * (act_bit // 8)] = org_cell_state.view(np.uint8)
                        addr = addrs[hidden_state_idx]
                        buf[addr:addr + org_hidden_state.size * (act_bit // 8)] = org_hidden_state.view(np.uint8)
                elif i == 1:
                    # Second extern: nothing to compute in software; store the
                    # features returned by the first extern as a new keyframe.
                    ip.resume_extern()
                    keyframe_buffer.add_new_keyframe(reference_poses[n][0], feature_half_value.copy())
                else:
                    run_extern(code)
                    ip.resume_extern()

            ### prepare hidden_state and cell_state ###
            # hidden_state is a view into buf at the hidden_state output buffer,
            # not a copy.
            addr = addrs[2]
            output_aligned_shape = output_aligned_shapes[2]
            hidden_state = buf[addr:addr + shape2size(output_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(output_aligned_shape)

            previous_pose = reference_poses[n]

            ip.wait()

            ### prepare previous_depth and hidden_state ###
            # Read the depth output, keep channel 0, rescale from fixed point
            # with 14 fractional bits and convert the result to depth.
            addr = addrs[3]
            output_aligned_shape = output_aligned_shapes[3]
            depth_org = buf[addr:addr + shape2size(output_aligned_shape) * (act_bit // 8)].view(np.int16).reshape(output_aligned_shape)
            depth_org = depth_org[:,:,:,:1]
            depth_org = (depth_org.transpose(0, 3, 1, 2) / (1 << 14)).astype(np.float32)
            inverse_depth_full = inverse_depth_multiplier * depth_org + inverse_depth_base
            previous_depth = 1.0 / inverse_depth_full

            elapsed = time.time() - start_time
            print(elapsed)
            f.write("%.17f\n" % elapsed)

            depths.append(previous_depth)
            idx += 1

        elapsed_total = time.time() - start_time_total
        print(elapsed_total)
        f.write("%.17f\n" % elapsed_total)

        # NOTE(review): assumes the depths/ directory already exists.
        np.savez_compressed("depths/%s" % test_dataset_name, depths=depths)

Predicting chess-seq-01
evaluating 00003.png (response: 0) ...
0.2270979881286621
evaluating 00004.png (response: 1) ...
0.26862168312072754
evaluating 00005.png (response: 1) ...
0.31209492683410645
evaluating 00006.png (response: 1) ...
0.2659752368927002
evaluating 00007.png (response: 1) ...
0.2711179256439209
evaluating 00008.png (response: 1) ...
0.3214750289916992
evaluating 00009.png (response: 1) ...
0.28626275062561035
evaluating 00010.png (response: 1) ...
0.2768685817718506
evaluating 00011.png (response: 1) ...
0.26636815071105957
evaluating 00012.png (response: 1) ...
0.3115694522857666
evaluating 00013.png (response: 1) ...
0.2681155204772949
evaluating 00014.png (response: 1) ...
0.2664635181427002
evaluating 00015.png (response: 1) ...
0.2744786739349365
evaluating 00016.png (response: 1) ...
0.30304431915283203
evaluating 00017.png (response: 1) ...
0.26759910583496094
evaluating 00018.png (response: 1) ...
0.3244500160217285
evaluating 00019.png (response: 1) ...
0.2