**import onnxruntime (again) to avoid version error**

In [1]:
import onnx 
import torch 
import onnxruntime
import os
    
build_dir = os.environ["FINN_BUILD_DIR"]

version check for debugging 

In [2]:
print(onnx.__version__)
print(onnxruntime.__version__)

1.11.0
1.11.1


# Both helpers come from skl2onnx; `enumerate_model_node_outputs` has to be
# imported explicitly, otherwise the loop below fails with a NameError.
from skl2onnx.helpers.onnx_helper import (
    enumerate_model_node_outputs,
    load_onnx_model,
)

# Load the exported ONNX file and print the name of every node output in it.
model_onnx = load_onnx_model("identify_before_transpose.onnx")
for out in enumerate_model_node_outputs(model_onnx):
    print(out)

## 1. Import model into FINN with ModelWrapper <a id="brevitas_import_visualization"></a>

Now that we have the model in .onnx format, we can work with it using FINN. To import it into FINN, we'll use the [`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#qonnx.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model.

In [3]:
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.util.cleanup import cleanup_model

# File name of the ONNX model that is loaded for the rest of the notebook.
ready_model_filename = "tiny_FCinter_tnaive.onnx"
# Wrap the ONNX file in a ModelWrapper so the helper methods used below are available.
model_for_sim = ModelWrapper(ready_model_filename)

# Run qonnx's cleanup_model helper and keep the model it returns.
model_for_sim = cleanup_model(model_for_sim)

                i.e. domain=finn to domain=qonnx.custom_op.<general|fpgadataflow|...>


/channel_interaction/fc_channel/export_handler/Constant_output_0
[4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74]


Exception: Found multiple get_by_name matches, undefined behavior

In [None]:
def count_equal_nodes(model):
    """Count how often each operator type occurs in ``model.graph.node``.

    Args:
        model: object exposing ``graph.node``, an iterable of nodes that each
            carry an ``op_type`` attribute.

    Returns:
        dict mapping every encountered ``op_type`` to its number of
        occurrences, keyed in order of first appearance.
    """
    op_histogram = {}
    for graph_node in model.graph.node:
        op_name = graph_node.op_type
        # .get() supplies 0 for operator types seen for the first time.
        op_histogram[op_name] = op_histogram.get(op_name, 0) + 1
    return op_histogram

In [None]:
print(model_for_sim.analysis(count_equal_nodes))

In [None]:
from finn.util.visualization import showInNetron

In [None]:
showInNetron("tiny_identity_tnaive.onnx")

"""
Let's have a look at some of the member functions exposed by `ModelWrapper` 
to see what kind of information we can extract from it.
"""
#dir(model_for_sim)

We can extract the shape and datatype annotations for various tensors in the graph, as well as information about the operator type of each node.

from qonnx.core.datatype import DataType

finnonnx_in_tensor_name = model_for_sim.graph.input[0].name
finnonnx_out_tensor_name = model_for_sim.graph.output[0].name
print("Input tensor name: %s" % finnonnx_in_tensor_name)
print("Output tensor name: %s" % finnonnx_out_tensor_name)
finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)
finnonnx_model_out_shape = model_for_sim.get_tensor_shape(finnonnx_out_tensor_name)
print("Input tensor shape: %s" % str(finnonnx_model_in_shape))
print("Output tensor shape: %s" % str(finnonnx_model_out_shape))
finnonnx_model_in_dt = model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)
finnonnx_model_out_dt = model_for_sim.get_tensor_datatype(finnonnx_out_tensor_name)
print("Input tensor datatype: %s" % str(finnonnx_model_in_dt.name))
print("Output tensor datatype: %s" % str(finnonnx_model_out_dt.name))
print("List of node operator types in the graph: ")
print([x.op_type for x in model_for_sim.graph.node])

## 2. Network preparation: Tidy-up transformations <a id="network_preparations"></a>

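Before the model can be processed further, all the intermediate tensors need to have statically defined shapes. We therefore apply a set of basic tidy-up transformations first.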

These transformations are:

- GiveUniqueNodeNames
- GiveReadableTensorNames
- InferShapes
- InferDataTypes
- FoldConstants
- RemoveStaticGraphInputs


In [None]:
from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs
from qonnx.transformation.infer_shapes import InferShapes
from qonnx.transformation.infer_datatypes import InferDataTypes
from qonnx.transformation.fold_constants import FoldConstants

# Tidy-up passes; they are applied one after another in exactly this order.
tidy_up_passes = (
    InferShapes(),
    FoldConstants(),
    GiveUniqueNodeNames(),
    GiveReadableTensorNames(),
    InferDataTypes(),
    RemoveStaticGraphInputs(),
)
for tidy_pass in tidy_up_passes:
    model_for_sim = model_for_sim.transform(tidy_pass)

# Persist the tidied model so later cells can reload it from disk.
tidy_model_filename = "tidy.onnx"
model_for_sim.save(tidy_model_filename)

In [None]:
showInNetron(tidy_model_filename)

# Debugging cell: look up three nodes by name. Each variable is only bound if
# a node with that exact name exists in the graph, so a missing node leads to
# a NameError further down.
for n in model_for_sim.graph.node:
    if n.name == 'Mul_18':
        mul_node = n
    elif n.name == 'MatMul_1':
        matmul_node = n
    elif n.name == 'Transpose_3':
        transpose_node = n
initializer_list = model_for_sim.graph.initializer

# Find the initializer entry named 'Gather_36_param0' (unbound if absent).
for x in initializer_list:
    if x.name == 'Gather_36_param0':
        gather_initializer = x
        
#output = transpose_node.output
#print(initializer_list)
print(gather_initializer)
# The variable name below is misspelled ("initailizer"); it is only printed.
initailizer = model_for_sim.get_initializer('Unsqueeze_36_out0')
print(initailizer)
#model_for_sim.set_initializer(mul_node,initializer)



# NOTE(review): these calls pass a graph node as the first argument and a
# graph.initializer entry as the second. qonnx's ModelWrapper.set_initializer
# appears to expect (tensor name, numpy array) instead -- confirm the intended
# target tensor and value before relying on this cell.
model_for_sim.set_initializer(mul_node,gather_initializer)
model_for_sim.set_initializer(matmul_node,gather_initializer)

### Adding Pre- and Postprocessing <a id='prepost'></a>

Preprocessing and postprocessing steps can be added directly in the ONNX graph. <span style="color:red">
In FINN, we can bake some of these pre/postprocessing operations into the graph, and in some cases this can be highly beneficial for performance because it allows our accelerator to directly consume raw data instead of going through CPU preprocessing.</span>

In [None]:
from qonnx.transformation.insert_topk import InsertTopK
from qonnx.transformation.infer_datatypes import InferDataTypes

chkpt_name = "tinyHAR_pre_post.onnx"

# postprocessing: reload the tidied model and insert a Top-1 node at the end
model = ModelWrapper(tidy_model_filename).transform(InsertTopK(k=1))

# tidy-up again, same pass order as for the original model
for cleanup_pass in (
    InferShapes(),
    FoldConstants(),
    GiveUniqueNodeNames(),
    GiveReadableTensorNames(),
    InferDataTypes(),
    RemoveStaticGraphInputs(),
):
    model = model.transform(cleanup_pass)

model.save(chkpt_name)

In [None]:
showInNetron(chkpt_name)

### Information for manual partitioning

In [None]:
"""
manually partiton
"""
from qonnx.core.modelwrapper import ModelWrapper

# Print the graph position of the nodes used as partition boundaries.
# The index is looked up in `model`, i.e. the same graph that is being
# iterated; asking a different wrapper (`model_for_sim`) for the index of a
# node that belongs to `model` can yield None or a position that does not
# match this graph.
for n in model.graph.node:
    if n.name == 'Add_1':
        add_1 = n
        print("Add_1 " + str(model.get_node_index(n)))
    elif n.name == 'MatMul_3':
        matMul_3 = n
        print("MatMul_3 " + str(model.get_node_index(n)))
    elif n.name == 'Reshape_0':
        reshape_0 = n
        print("Reshape_0 " + str(model.get_node_index(n)))
    elif n.name == 'Transpose_0':
        transpose_0 = n
        print("Transpose_0 " + str(model.get_node_index(n)))
    elif n.name == 'Transpose_3':
        transpose_3 = n
        print("Transpose_3 " + str(model.get_node_index(n)))

In [None]:
#partition_dir
import os

import qonnx.transformation.create_generic_partitions as partition
import finn.transformation.fpgadataflow.cleanup as cleanup

# Set partition_dir to the current directory. Using os.getcwd() instead of a
# hard-coded absolute user path keeps the notebook runnable on other machines
# and matches the later cells, which load "partition_0.onnx" /
# "partition_1.onnx" relative to the working directory.
partition_dir = os.getcwd()

model = ModelWrapper("tinyHAR_pre_post.onnx")
model = model.transform(cleanup.CleanUp())
model = model.transform(InferShapes())

# Partition 0 covers node indices 0..34 and partition 1 covers 111..126
# (range end is exclusive). The indices are specific to this model's graph.
model = model.transform(partition.PartitionFromDict({0 : range(0, 35), 1 : range(111, 127)}, partition_dir = partition_dir))

In [None]:
model.save("partition.onnx")

## 2. How FINN Implements Convolutions: Lowering and Streamlining

In FINN, we implement convolutions with the *lowering* approach: we convert them to matrix-matrix multiply operations, where one of the matrices is generated by sliding a window over the input image. You can read more about the sliding window operator and how convolution lowering works [in this notebook](https://github.com/maltanar/qnn-inference-examples/blob/master/3-convolutional-binarized-gtsrb.ipynb). The streaming dataflow architecture we will end up with is going to look something like this figure from the [FINN-R paper](https://arxiv.org/abs/1809.04570):

![](cnv-mp-fc.png)

Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vivado HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).


To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. 

**The current implementation of streamlining is highly network-specific and may not work for your network if its topology is very different than the example network here. We hope to rectify this in future releases.**

### part0

In [None]:
from finn.transformation.streamline import Streamline
from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
import finn.transformation.streamline.absorb as absorb
import finn.transformation.streamline.reorder as reorder
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
from qonnx.transformation.infer_data_layouts import InferDataLayouts
from qonnx.transformation.general import RemoveUnusedTensors

import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls

# Streamline partition 0. The input file name follows the partition_<id>.onnx
# pattern; presumably it is written by the PartitionFromDict cell -- confirm.
model = ModelWrapper("partition_0.onnx")
model = model.transform(Streamline())
# conv1d: the kernel shape is not 2D
model = model.transform(LowerConvsToMatMul())
# TinyHAR has no MaxPool layer, so the MaxPool layout transformation is disabled
#model = model.transform(MakeMaxPoolNHWC())

model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())

#model = model.transform(ConvertBipolarMatMulToXnorPopcount())
# second Streamline pass after lowering / transpose absorption
model = model.transform(Streamline())
# absorb final add-mul nodes into TopK
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
# this pass emits: infer_data_layouts.py:119: UserWarning: Assuming 4D input is NCHW
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())
# checkpoint consumed by the HLS conversion cell for part 0
model.save("part0_streamlined.onnx")

### part 1

In [None]:
from finn.transformation.streamline import Streamline
from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
import finn.transformation.streamline.absorb as absorb
import finn.transformation.streamline.reorder as reorder
from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
from qonnx.transformation.infer_data_layouts import InferDataLayouts
from qonnx.transformation.general import RemoveUnusedTensors

import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls

# Streamline partition 1. The input file name follows the partition_<id>.onnx
# pattern; presumably it is written by the PartitionFromDict cell -- confirm.
model = ModelWrapper("partition_1.onnx")

# conv1d: convert the 3D tensors to 4D before the remaining passes
model = model.transform(Change3DTo4DTensors())
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())

# conv1d: the kernel shape is not 2D
model = model.transform(LowerConvsToMatMul())
# TinyHAR has no MaxPool layer, so the MaxPool layout transformation is disabled
#model = model.transform(MakeMaxPoolNHWC())

model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())

# NOTE(review): the LowerConvsToMatMul + AbsorbTransposeIntoMultiThreshold pair
# below repeats the two passes directly above -- confirm the second round is
# intentional and not a copy-paste leftover.
# conv1d: the kernel shape is not 2D
model = model.transform(LowerConvsToMatMul())
# TinyHAR has no MaxPool layer, so the MaxPool layout transformation is disabled
#model = model.transform(MakeMaxPoolNHWC())

model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
model = model.transform(absorb.AbsorbAddIntoMultiThreshold())
model = model.transform(absorb.AbsorbMulIntoMultiThreshold())

#model = model.transform(ConvertBipolarMatMulToXnorPopcount())
model = model.transform(Streamline())
# absorb final add-mul nodes into TopK
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
# this pass emits: infer_data_layouts.py:119: UserWarning: Assuming 4D input is NCHW
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())
# checkpoint consumed by the HLS conversion cell for part 1
model.save("part1_streamlined.onnx")

In [None]:
showInNetron("part1_streamlined.onnx")

## 3. Partitioning, Conversion to HLS Layers and Folding

The next steps will be (again) very similar to what we did for the TFC-w1a1 network. We'll first convert the layers that we can put into the FPGA into their HLS equivalents and separate them out into a *dataflow partition*:


# TODO
- `TopK` should be converted to the HLS layer `LabelSelect_Batch`: its input must have an integer datatype (`is_integer()`).
- The first `MultiThreshold` should be converted to the HLS layer `Thresholding_Batch`: this requires manipulating the `Transpose` and `MultiThreshold` nodes.

In [None]:
from finn.transformation.streamline import Streamline

In [None]:
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.infer_data_layouts import InferDataLayouts

# Convert the streamlined part-0 graph to HLS layers.
# `to_hls` and `absorb` are not imported in this cell; they come from the
# import statements of the streamlining cells earlier in the notebook.

# choose the memory mode for the MVTU units, decoupled or const
mem_mode = "decoupled"
model = ModelWrapper("part0_streamlined.onnx")

# matrix-vector layers (binary first, then quantized), both using mem_mode
model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))
model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))
# TopK to LabelSelect
model = model.transform(to_hls.InferLabelSelectLayer())
# input quantization (if any) to standalone thresholding
model = model.transform(to_hls.InferThresholdingLayer())
model = model.transform(to_hls.InferConvInpGen())
#model = model.transform(to_hls.InferStreamingMaxPool())
# get rid of Reshape(-1, 1) operation between hlslib nodes
model = model.transform(RemoveCNVtoFCFlatten())
# get rid of Transpose -> Transpose identity sequences
model = model.transform(absorb.AbsorbConsecutiveTransposes())

# infer tensor data layouts
model = model.transform(InferDataLayouts())
model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
"""
#Try InferChannelwiseLinearLayer() for the Mul/Add 
#and InferLabelSelectLayer() for the TopK, 
#followed by GiveUniqueNodeNames() to clean up the model again.
model = model.transform(to_hls.InferChannelwiseLinearLayer())
model = model.transform(to_hls.InferLabelSelectLayer())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
model = model.transform(absorb.AbsorbConsecutiveTransposes())
model = model.transform(to_hls.InferConvInpGen())
model = model.transform(RemoveCNVtoFCFlatten())

test_model_filename = "tinyHAR_test.onnx"
model.save(test_model_filename)
"""


In [None]:
model = model.transform(to_hls.InferChannelwiseLinearLayer())
model = model.transform(to_hls.InferLabelSelectLayer())
model = model.transform(GiveUniqueNodeNames())

In [None]:
# Apply CreateDataflowPartition and keep the resulting parent graph on disk.
parent_model = model.transform(CreateDataflowPartition())
parent_model.save("part0_parent.onnx")

# Wrap the first StreamingDataflowPartition node as a custom op; its "model"
# attribute holds the file name that is loaded as the dataflow model below.
sdp_node = getCustomOp(
    parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
)
dataflow_model_filename = sdp_node.get_nodeattr("model")

# save the dataflow partition with a different name for easier access
dataflow_model = ModelWrapper(dataflow_model_filename)
dataflow_model.save("part0_dataflow_model.onnx")

In [None]:
showInNetron("part0_parent.onnx")

In [None]:
showInNetron("part0_dataflow_model.onnx")

In [None]:
# Apply the folding configuration to the part-0 dataflow model.
model = ModelWrapper("part0_dataflow_model.onnx")
fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
# each tuple is (PE, SIMD, in_fifo_depth, ramstyle) for a layer
folding = [
    (1, 1, 64, "auto"),
    (5, 10, 64, "auto"),
    (5, 10, 64, "auto"),
    (5, 10, 64, "auto")
]
# zip() stops at the shorter sequence, so surplus layers or surplus folding
# entries are silently left untouched.
for fcl, (pe, simd, ififodepth, ramstyle) in zip(fc_layers, folding):
    fcl_inst = getCustomOp(fcl)
    fcl_inst.set_nodeattr("PE", pe)
    fcl_inst.set_nodeattr("SIMD", simd)
    fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
    fcl_inst.set_nodeattr("ram_style", ramstyle)

# use same SIMD values for the sliding window operators
swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")
for i in range(len(swg_layers)):
    swg_inst = getCustomOp(swg_layers[i])
    # Read SIMD and ram_style from folding entry i. Relying on the `ramstyle`
    # variable left over from the loop above would raise a NameError when the
    # model has no MatrixVectorActivation layers, and would always apply the
    # last layer's style.
    simd = folding[i][1]
    swg_ramstyle = folding[i][3]
    swg_inst.set_nodeattr("SIMD", simd)
    swg_inst.set_nodeattr("ram_style", swg_ramstyle)

model = model.transform(GiveUniqueNodeNames())
model.save("part0_dataflow_folded.onnx")

In [None]:
showInNetron("part0_dataflow_folded.onnx")

In [None]:
# print the names of the supported PYNQ boards
from finn.util.basic import pynq_part_map
print(pynq_part_map.keys())

In [None]:
# Build settings handed to ZynqBuild below: platform name and period_ns value.
test_pynq_board = "Pynq-Z1"
target_clk_ns = 10

from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
# Reload the folded part-0 model from disk and run the ZynqBuild transformation on it.
model = ModelWrapper("part0_dataflow_folded.onnx")
model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))

After the `ZynqBuild` we run one additional transformation to generate a PYNQ driver for the accelerator.

In [None]:
from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
model = model.transform(MakePYNQDriver("zynq-iodma"))

In [None]:
model.save("part0_synth.onnx")

### part1

In [None]:
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.infer_data_layouts import InferDataLayouts

# choose the memory mode for the MVTU units, decoupled or const
# Convert the streamlined part-1 graph to HLS layers.
# `to_hls` and `absorb` are not imported in this cell; they come from the
# import statements of the streamlining cells earlier in the notebook.
mem_mode = "decoupled"
model = ModelWrapper("part1_streamlined.onnx")

# matrix-vector layers (binary first, then quantized), both using mem_mode
model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))
model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))
# TopK to LabelSelect
model = model.transform(to_hls.InferLabelSelectLayer())
# input quantization (if any) to standalone thresholding
model = model.transform(to_hls.InferThresholdingLayer())
model = model.transform(to_hls.InferConvInpGen())
#model = model.transform(to_hls.InferStreamingMaxPool())
# get rid of Reshape(-1, 1) operation between hlslib nodes
model = model.transform(RemoveCNVtoFCFlatten())
# get rid of Transpose -> Transpose identity sequences
model = model.transform(absorb.AbsorbConsecutiveTransposes())

# infer tensor data layouts
model = model.transform(InferDataLayouts())
model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
# second round: channelwise linear layers and LabelSelect, then re-name nodes
model = model.transform(to_hls.InferChannelwiseLinearLayer())
model = model.transform(to_hls.InferLabelSelectLayer())
model = model.transform(GiveUniqueNodeNames())

In [None]:
# Apply CreateDataflowPartition and keep the resulting parent graph on disk.
parent_model = model.transform(CreateDataflowPartition())
parent_model.save("part1_parent.onnx")

# Wrap the first StreamingDataflowPartition node as a custom op; its "model"
# attribute holds the file name that is loaded as the dataflow model below.
sdp_node = getCustomOp(
    parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
)
dataflow_model_filename = sdp_node.get_nodeattr("model")

# save the dataflow partition with a different name for easier access
dataflow_model = ModelWrapper(dataflow_model_filename)
dataflow_model.save("part1_dataflow_model.onnx")

In [None]:
# Apply the folding configuration to the part-1 dataflow model.
model = ModelWrapper("part1_dataflow_model.onnx")
fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
# each tuple is (PE, SIMD, ramstyle) for a layer (no FIFO depth in this part)
folding = [
    (10, 10, "auto"),
    (10, 10, "auto"),
]
# zip() stops at the shorter sequence, so surplus layers or surplus folding
# entries are silently left untouched.
for fcl, (pe, simd, ramstyle) in zip(fc_layers, folding):
    fcl_inst = getCustomOp(fcl)
    fcl_inst.set_nodeattr("PE", pe)
    fcl_inst.set_nodeattr("SIMD", simd)
    fcl_inst.set_nodeattr("ram_style", ramstyle)

# use same SIMD values for the sliding window operators
swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")
for i in range(len(swg_layers)):
    swg_inst = getCustomOp(swg_layers[i])
    # Read SIMD and ram_style from folding entry i. Relying on the `ramstyle`
    # variable left over from the loop above would raise a NameError when the
    # model has no MatrixVectorActivation layers, and would always apply the
    # last layer's style.
    simd = folding[i][1]
    swg_ramstyle = folding[i][2]
    swg_inst.set_nodeattr("SIMD", simd)
    swg_inst.set_nodeattr("ram_style", swg_ramstyle)

model = model.transform(GiveUniqueNodeNames())
model.save("part1_dataflow_folded.onnx")

In [None]:
# print the names of the supported PYNQ boards
from finn.util.basic import pynq_part_map
print(pynq_part_map.keys())

In [None]:
# Build settings handed to ZynqBuild below: platform name and period_ns value.
test_pynq_board = "Pynq-Z1"
target_clk_ns = 10

from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
# Reload the folded part-1 model from disk and run the ZynqBuild transformation on it.
model = ModelWrapper("part1_dataflow_folded.onnx")
model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))

# 5. Launch a Build: Only Estimate Reports <a id="build_estimate_report"></a>

First, we'll launch a build that only generates the estimate reports, which does not require any synthesis. Note two things below: how the `generate_outputs` only contains `ESTIMATE_REPORTS`, but also how the `steps` uses a value of `estimate_only_dataflow_steps`. This skips steps like HLS synthesis to provide a quick estimate from analytical models.

In [None]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

# NOTE(review): no cell visible in this notebook writes "cnn_dataflow_model.onnx";
# the dataflow models saved above are "part0_dataflow_model.onnx" and
# "part1_dataflow_model.onnx" -- confirm which file should be built here.
model_file = "cnn_dataflow_model.onnx"

# Directory handed to the build as output_dir.
estimates_output_dir = "output_estimates_only"

# Delete previous run results if they exist
if os.path.exists(estimates_output_dir):
    shutil.rmtree(estimates_output_dir)
    print("Previous run results deleted!")


# Build configuration restricted to the estimate-only step list; the only
# requested output type is ESTIMATE_REPORTS.
cfg_estimates = build.DataflowBuildConfig(
    output_dir          = estimates_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = 1000000,
    synth_clk_period_ns = 10.0,
    fpga_part           = "xc7z020clg400-1",
    steps               = build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
    ]
)

In [None]:
%%time
build.build_dataflow_cfg(model_file, cfg_estimates)

In [None]:
! ls {estimates_output_dir}

In [None]:
! ls {estimates_output_dir}/report

In [None]:
! cat {estimates_output_dir}/report/estimate_network_performance.json

In [None]:
import json
def read_json_dict(filename):
    """Parse the JSON file at ``filename`` and return the decoded object.

    Args:
        filename: path of the JSON file to open for reading.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(filename, "r") as json_file:
        return json.load(json_file)

In [None]:
read_json_dict(estimates_output_dir + "/report/estimate_layer_cycles.json")

In [None]:
read_json_dict(estimates_output_dir + "/report/estimate_layer_resources.json")