**import onnxruntime (again) to avoid version error**

In [2]:
import onnx 
import torch 
import onnxruntime
import os
    
# FINN build directory, read from the FINN_BUILD_DIR environment variable
# (raises KeyError if the variable is not set).
build_dir = os.environ["FINN_BUILD_DIR"]

version check for debugging 

In [3]:
# Report the installed version of each ONNX-related package, one per line.
for pkg in (onnx, onnxruntime):
    print(pkg.__version__)

1.11.0
1.11.1


## 1. Import model into FINN with ModelWrapper

In [4]:
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.util.cleanup import cleanup_model
from qonnx.transformation.fold_constants import FoldConstants

# Exported ONNX file that is the starting point of the whole flow.
ready_model_filename = "pamp2_attn_conv_4_mod.onnx"

# Wrap the file in a ModelWrapper and run the qonnx cleanup on it in one step.
model_for_sim = cleanup_model(ModelWrapper(ready_model_filename))

                i.e. domain=finn to domain=qonnx.custom_op.<general|fpgadataflow|...>


In [4]:
def count_equal_nodes(model):
    """Count how many nodes of each op_type the model's graph contains.

    Args:
        model: object exposing ``graph.node``, an iterable of nodes that each
            have an ``op_type`` attribute.

    Returns:
        dict mapping op_type to its number of occurrences; keys appear in
        order of first occurrence in the graph.
    """
    tally = {}
    for graph_node in model.graph.node:
        kind = graph_node.op_type
        tally[kind] = tally.get(kind, 0) + 1
    return tally

In [5]:
# Run count_equal_nodes as an analysis pass on the cleaned model and print the
# resulting op_type -> count dictionary.
print(model_for_sim.analysis(count_equal_nodes))

{'MultiThreshold': 15, 'Add': 45, 'Mul': 171, 'Conv': 6, 'BatchNormalization': 4, 'Transpose': 148, 'Gather': 37, 'MatMul': 184, 'Softmax': 37, 'Unsqueeze': 37, 'Concat': 1, 'Reshape': 1, 'Squeeze': 1, 'ReduceSum': 1}


In [6]:
from finn.util.visualization import showInNetron

In [7]:
# Visualise the original exported model. Use the filename variable (as the
# other showInNetron cells do) instead of repeating the string literal, so the
# viewer always shows the file that was actually loaded.
showInNetron(ready_model_filename)

Serving 'pamp2_attn_conv_4_mod.onnx' at http://0.0.0.0:8081


## 2. Network preparation: Tidy-up transformations <a id="network_preparations"></a>

All intermediate tensors need to have statically defined shapes, so we first run a set of tidy-up transformations.

These transformations are:

- GiveUniqueNodeNames
- GiveReadableTensorNames
- InferShapes
- InferDataTypes
- FoldConstants
- RemoveStaticGraphInputs


In [8]:
from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs
from qonnx.transformation.infer_shapes import InferShapes
from qonnx.transformation.infer_datatypes import InferDataTypes
from qonnx.transformation.fold_constants import FoldConstants

# Tidy-up passes, applied strictly in the order listed.
tidy_passes = (
    InferShapes(),
    FoldConstants(),
    GiveUniqueNodeNames(),
    GiveReadableTensorNames(),
    InferDataTypes(),
    RemoveStaticGraphInputs(),
)
for tidy_pass in tidy_passes:
    model_for_sim = model_for_sim.transform(tidy_pass)

# Checkpoint the tidied model so later cells can reload it from disk.
tidy_model_filename = "pamp2_tidy.onnx"
model_for_sim.save(tidy_model_filename)

In [9]:
# Visualise the tidied checkpoint saved by the previous cell.
showInNetron(tidy_model_filename)

Stopping http://0.0.0.0:8081
Serving 'pamp2_tidy.onnx' at http://0.0.0.0:8081


### Adding Pre- and Postprocessing <a id='prepost'></a>

Preprocessing and postprocessing steps can be added directly in the ONNX graph.
In FINN, we can bake some of these pre/postprocessing operations into the graph, and in some cases this can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing.

In [10]:
from qonnx.transformation.insert_topk import InsertTopK
from qonnx.transformation.infer_datatypes import InferDataTypes

chkpt_name = "pamp2_pre_post.onnx"

# Postprocessing: reload the tidied checkpoint and insert a Top-1 node at the end.
model = ModelWrapper(tidy_model_filename).transform(InsertTopK(k=1))

# Tidy up again after the graph change, same passes and order as before.
for post_pass in (
    InferShapes(),
    FoldConstants(),
    GiveUniqueNodeNames(),
    GiveReadableTensorNames(),
    InferDataTypes(),
    RemoveStaticGraphInputs(),
):
    model = model.transform(post_pass)

model.save(chkpt_name)

In [11]:
# Visualise the checkpoint that includes the inserted Top-1 node.
showInNetron(chkpt_name)

Stopping http://0.0.0.0:8081
Serving 'pamp2_pre_post.onnx' at http://0.0.0.0:8081


### Manual partitioning

In [12]:
"""
manually partiton
"""
from qonnx.core.modelwrapper import ModelWrapper

# Names of the nodes whose positions delimit the manual partitions.
anchor_names = (
    "Add_1",
    "MatMul_3",
    "Reshape_0",
    "Transpose_0",
    "Transpose_145",
    "Transpose_147",
)

# Report each anchor's index within `model`, the graph that is being iterated.
# The index is taken from the iteration itself instead of being looked up in
# the separate `model_for_sim` object, so the printed numbers always describe
# the graph whose nodes are being matched. Output is in graph order, one
# "<name> <index>" line per anchor found.
anchor_nodes = {}  # node name -> node, for anchors present in the graph
for node_index, n in enumerate(model.graph.node):
    if n.name in anchor_names:
        anchor_nodes[n.name] = n
        print(n.name + " " + str(node_index))

Add_1 9
Transpose_0 35
MatMul_3 76
Transpose_145 649
Reshape_0 650
Transpose_147 667


In [13]:
#partition_dir
import os

import qonnx.transformation.create_generic_partitions as partition
import finn.transformation.fpgadataflow.cleanup as cleanup

# Write the partition files into the current working directory. Later cells
# load "partition_0.onnx" / "partition_1.onnx" by relative path, so a
# machine-specific absolute path would break the notebook on any other setup.
partition_dir = os.getcwd()

# Reload the pre/post-processed checkpoint and clean it up before partitioning.
model = ModelWrapper(chkpt_name)
model = model.transform(cleanup.CleanUp())
model = model.transform(InferShapes())

# Node-index ranges of the two manual partitions. They come from the indices
# printed by the anchor-lookup cell: partition 0 is nodes 0..34 (everything
# before Transpose_0 at 35), partition 1 is nodes 649..666 (Transpose_145 up to,
# but not including, Transpose_147 at 667). These numbers are only valid for
# this particular graph and must be re-derived if the model changes.
partition_ranges = {0: range(0, 35), 1: range(649, 667)}
model = model.transform(partition.PartitionFromDict(partition_ranges, partition_dir=partition_dir))

save as file /home/bian/finn/notebooks/TinyHAR/e.PAMP2/partition_0.onnx
save as file /home/bian/finn/notebooks/TinyHAR/e.PAMP2/partition_1.onnx


In [14]:
# Save the model returned by the PartitionFromDict transformation.
model.save("pamp2_partition.onnx")

## part0

### 2. Lowering and Streamlining

In [15]:
from finn.transformation.streamline import Streamline
from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
import finn.transformation.streamline.absorb as absorb
import finn.transformation.streamline.reorder as reorder
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
from qonnx.transformation.infer_data_layouts import InferDataLayouts
from qonnx.transformation.general import RemoveUnusedTensors

import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls

# Streamline and lower the first manual partition; the result is saved as
# "pamp2_part0_streamlined.onnx". The passes run in exactly the order written.
# "partition_0.onnx" is the file written by the PartitionFromDict cell.
model = ModelWrapper("partition_0.onnx")
model = model.transform(Streamline())
# The convolutions are 1D (kernel shape is not 2D); lower them to MatMul.
model = model.transform(LowerConvsToMatMul())
# TinyHAR has no MaxPool, so the MaxPool layout pass stays disabled.
#model = model.transform(MakeMaxPoolNHWC())

model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())

# Bipolar-to-XNOR conversion is disabled for this model.
#model = model.transform(ConvertBipolarMatMulToXnorPopcount())
# Second streamlining pass, after lowering and transpose absorption.
model = model.transform(Streamline())
# Absorb final add/mul nodes into TopK.
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
# Emits "infer_data_layouts.py:119: UserWarning: Assuming 4D input is NCHW".
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())
model.save("pamp2_part0_streamlined.onnx")



In [16]:
# Visualise the streamlined partition-0 model saved by the previous cell.
showInNetron("pamp2_part0_streamlined.onnx")

Stopping http://0.0.0.0:8081
Serving 'pamp2_part0_streamlined.onnx' at http://0.0.0.0:8081


### 3. Partitioning, Conversion to HLS Layers and Folding

In [17]:
from finn.transformation.streamline import Streamline

In [18]:
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.infer_data_layouts import InferDataLayouts

# Convert the streamlined partition-0 model to HLS layers, one inference pass
# per layer type, in the order written below.
# Choose the memory mode for the MVTU units: "decoupled" or "const".
mem_mode = "decoupled"
model = ModelWrapper("pamp2_part0_streamlined.onnx")

model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))
model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))
# TopK to LabelSelect
model = model.transform(to_hls.InferLabelSelectLayer())
# Input quantization (if any) to standalone thresholding
model = model.transform(to_hls.InferThresholdingLayer())
model = model.transform(to_hls.InferConvInpGen())
# No MaxPool in this model, so the streaming MaxPool inference stays disabled.
#model = model.transform(to_hls.InferStreamingMaxPool())
# Get rid of the Reshape(-1, 1) operation between hlslib nodes
model = model.transform(RemoveCNVtoFCFlatten())
# Get rid of Transpose -> Transpose identity sequences
model = model.transform(absorb.AbsorbConsecutiveTransposes())

# Infer tensor data layouts
model = model.transform(InferDataLayouts())
model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())

node MultiThreshold_0 input not integer
node MultiThreshold_0 input not integer


In [19]:
# Remaining conversions, then renumber the nodes with unique names.
# NOTE(review): InferLabelSelectLayer was already applied in the previous cell;
# presumably repeated on purpose after InferChannelwiseLinearLayer -- confirm.
model = model.transform(to_hls.InferChannelwiseLinearLayer())
model = model.transform(to_hls.InferLabelSelectLayer())
model = model.transform(GiveUniqueNodeNames())



In [20]:
# Apply CreateDataflowPartition and save the resulting parent model.
parent_model = model.transform(CreateDataflowPartition())
parent_model.save("pamp2_part0_parent.onnx")


# Take the first StreamingDataflowPartition node of the parent graph and read
# the path of the child model from its "model" attribute. Indexing with [0]
# raises IndexError if no such node was created.
sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
sdp_node = getCustomOp(sdp_node)
dataflow_model_filename = sdp_node.get_nodeattr("model")
# save the dataflow partition with a different name for easier access
dataflow_model = ModelWrapper(dataflow_model_filename)
dataflow_model.save("pamp2_part0_dataflow_model.onnx")

save as file /tmp/finn_dev_bian/dataflow_partition_9vtmbrbq/partition_0.onnx


In [21]:
# Visualise the parent model saved after CreateDataflowPartition.
showInNetron("pamp2_part0_parent.onnx")

Stopping http://0.0.0.0:8081
Serving 'pamp2_part0_parent.onnx' at http://0.0.0.0:8081


In [22]:
# Visualise the dataflow child model referenced by the StreamingDataflowPartition node.
showInNetron("pamp2_part0_dataflow_model.onnx")

Stopping http://0.0.0.0:8081
Serving 'pamp2_part0_dataflow_model.onnx' at http://0.0.0.0:8081


In [23]:
# Set the folding attributes on the partition-0 dataflow model by hand and
# save the result as "pamp2_part0_dataflow_folded.onnx".
model = ModelWrapper("pamp2_part0_dataflow_model.onnx")
fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
# each tuple is (PE, SIMD, in_fifo_depth, ramstyle) for a layer
folding = [
    (1, 1, 64, "auto"),
    (5, 10, 64, "auto"),
    (5, 10, 64, "auto"),
    (5, 10, 64, "auto"),
]
# zip pairs layers with folding entries in order and stops at the shorter list.
for fcl, (pe, simd, ififodepth, ramstyle) in zip(fc_layers, folding):
    fcl_inst = getCustomOp(fcl)
    fcl_inst.set_nodeattr("PE", pe)
    fcl_inst.set_nodeattr("SIMD", simd)
    fcl_inst.set_nodeattr("inFIFODepth", ififodepth)
    fcl_inst.set_nodeattr("ram_style", ramstyle)

# use same SIMD values for the sliding window operators
swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")
for i, swg in enumerate(swg_layers):
    # Read SIMD and ram_style from this generator's own folding entry rather
    # than reusing whatever `ramstyle` was left over from the loop above (which
    # is undefined when there are no MatrixVectorActivation layers). As before,
    # an IndexError is raised if there are more generators than folding entries.
    _, simd, _, ramstyle = folding[i]
    swg_inst = getCustomOp(swg)
    swg_inst.set_nodeattr("SIMD", simd)
    swg_inst.set_nodeattr("ram_style", ramstyle)

model = model.transform(GiveUniqueNodeNames())
model.save("pamp2_part0_dataflow_folded.onnx")

In [24]:
# Visualise the folded dataflow model saved by the previous cell.
showInNetron("pamp2_part0_dataflow_folded.onnx")

Stopping http://0.0.0.0:8081
Serving 'pamp2_part0_dataflow_folded.onnx' at http://0.0.0.0:8081


### 4. Hardware Generation

In [1]:
# print the names of the supported PYNQ boards
from finn.util.basic import pynq_part_map
print(pynq_part_map.keys())

dict_keys(['Ultra96', 'Pynq-Z1', 'Pynq-Z2', 'ZCU102', 'ZCU104', 'ZCU111', 'RFSoC2x2'])


In [5]:
# Target board (one of the pynq_part_map keys printed above) and clock period in ns.
test_pynq_board = "ZCU104"
target_clk_ns = 10

from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
# Reload the folded partition-0 model and run the ZynqBuild transformation for
# the chosen board and clock period.
model = ModelWrapper("pamp2_part0_dataflow_folded.onnx")
model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))

save as file /tmp/finn_dev_bian/dataflow_partition_rdcg6ymv/partition_0.onnx
save as file /tmp/finn_dev_bian/dataflow_partition_rdcg6ymv/partition_1.onnx
save as file /tmp/finn_dev_bian/dataflow_partition_rdcg6ymv/partition_2.onnx




After the `ZynqBuild` we run one additional transformation to generate a PYNQ driver for the accelerator.

In [6]:
from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
# Generate the PYNQ driver for the built accelerator, passing "zynq-iodma" as
# the platform argument.
model = model.transform(MakePYNQDriver("zynq-iodma"))

In [7]:
# Save the model as it stands after ZynqBuild and MakePYNQDriver.
model.save("pamp2_part0_synth.onnx")

### dataflow_performance

In [8]:
from qonnx.core.modelwrapper import ModelWrapper
import finn.analysis.fpgadataflow.dataflow_performance as performance

In [9]:
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg
import os
import shutil

# Input model for the estimate-only build: the folded partition-0 dataflow model.
model_file = "pamp2_part0_dataflow_folded.onnx"

estimates_output_dir = "output_estimates_only"

# Delete the results of a previous run, if any, so the reports are fresh.
if os.path.exists(estimates_output_dir):
    shutil.rmtree(estimates_output_dir)
    print("Previous run results deleted!")


# Build configuration restricted to the estimate-only steps; only estimate
# reports are requested as outputs.
# NOTE(review): fpga_part "xc7z020clg400-1" does not look like the part of the
# "ZCU104" board used for ZynqBuild earlier in this notebook -- confirm which
# device the estimates are meant for.
cfg_estimates = build.DataflowBuildConfig(
    output_dir          = estimates_output_dir,
    mvau_wwidth_max     = 80,
    target_fps          = 1000000,
    synth_clk_period_ns = 10.0,
    fpga_part           = "xc7z020clg400-1",
    steps               = build_cfg.estimate_only_dataflow_steps,
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
    ]
)

Previous run results deleted!


In [10]:
%%time
# Run the build with the estimate-only configuration; %%time reports the cell's run time.
build.build_dataflow_cfg(model_file, cfg_estimates)

Building dataflow accelerator from pamp2_part0_dataflow_folded.onnx
Intermediate outputs will be generated in /tmp/finn_dev_bian
Final outputs will be generated in output_estimates_only
Build log is at output_estimates_only/build_dataflow.log
Running step: step_qonnx_to_finn [1/8]
Running step: step_tidy_up [2/8]
Running step: step_streamline [3/8]
Running step: step_convert_to_hls [4/8]
Running step: step_create_dataflow_partition [5/8]
Running step: step_target_fps_parallelization [6/8]
Running step: step_apply_folding_config [7/8]
Running step: step_generate_estimate_reports [8/8]
Completed successfully
CPU times: user 862 ms, sys: 32.5 ms, total: 894 ms
Wall time: 822 ms


0

In [32]:
# List the top-level contents of the estimate-only output directory.
! ls {estimates_output_dir}

auto_folding_config.json  intermediate_models  time_per_step.json
build_dataflow.log	  report


In [33]:
# List the report files generated by the estimate-only build.
! ls {estimates_output_dir}/report

estimate_layer_config_alternatives.json  estimate_network_performance.json
estimate_layer_cycles.json		 op_and_param_counts.json
estimate_layer_resources.json


In [34]:
# Print the estimated network performance report (JSON).
! cat {estimates_output_dir}/report/estimate_network_performance.json

{
  "critical_path_cycles": 52510,
  "max_cycles": 7200,
  "max_cycles_node_name": "MatrixVectorActivation_1",
  "estimated_throughput_fps": 13888.888888888889,
  "estimated_latency_ns": 525100.0
}

## Part 1

### 2. Lowering and Streamlining

In [None]:
from finn.transformation.streamline import Streamline
from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul
from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
import finn.transformation.streamline.absorb as absorb
import finn.transformation.streamline.reorder as reorder
from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors
from finn.transformation.streamline.reorder import MakeMaxPoolNHWC, MoveScalarLinearPastInvariants
from qonnx.transformation.infer_data_layouts import InferDataLayouts
from qonnx.transformation.general import RemoveUnusedTensors

import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls

# Streamline and lower the second manual partition; the result is saved as
# "pamp2_part1_streamlined.onnx". "partition_1.onnx" is the file written by the
# PartitionFromDict cell.
model = ModelWrapper("partition_1.onnx")

# conv1d: convert the 3D tensors to 4D before lowering
model = model.transform(Change3DTo4DTensors())
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())

# conv1d, kernel shape is not 2D: lower convolutions to MatMul.
# This pass and the transpose absorption below used to be pasted twice back to
# back. ModelWrapper.transform already re-applies a transformation until it
# reports no further change, so the immediate repetition did nothing and each
# pass is now run once.
model = model.transform(LowerConvsToMatMul())
# TinyHAR has no MaxPool, so the MaxPool layout pass stays disabled.
#model = model.transform(MakeMaxPoolNHWC())

model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
model = model.transform(absorb.AbsorbAddIntoMultiThreshold())
model = model.transform(absorb.AbsorbMulIntoMultiThreshold())

# Bipolar-to-XNOR conversion is disabled for this model.
#model = model.transform(ConvertBipolarMatMulToXnorPopcount())
model = model.transform(Streamline())
# Absorb final add/mul nodes into TopK (repeated after streamlining).
model = model.transform(absorb.AbsorbScalarMulAddIntoTopK())
# Emits "infer_data_layouts.py:119: UserWarning: Assuming 4D input is NCHW".
model = model.transform(InferDataLayouts())
model = model.transform(RemoveUnusedTensors())
model.save("pamp2_part1_streamlined.onnx")

### 3. Partitioning, Conversion to HLS Layers and Folding

In [None]:
from finn.transformation.fpgadataflow.create_dataflow_partition import (
    CreateDataflowPartition,
)
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.infer_data_layouts import InferDataLayouts

# Convert the streamlined partition-1 model to HLS layers, mirroring the
# partition-0 conversion.
# Choose the memory mode for the MVTU units: "decoupled" or "const".
mem_mode = "decoupled"
# Load the file the streamlining cell actually saved. The previous name,
# "part1_streamlined.onnx", lacked the "pamp2_" prefix and is never written.
model = ModelWrapper("pamp2_part1_streamlined.onnx")

model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))
model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))
# TopK to LabelSelect
model = model.transform(to_hls.InferLabelSelectLayer())
# Input quantization (if any) to standalone thresholding
model = model.transform(to_hls.InferThresholdingLayer())
model = model.transform(to_hls.InferConvInpGen())
# No MaxPool in this model, so the streaming MaxPool inference stays disabled.
#model = model.transform(to_hls.InferStreamingMaxPool())
# Get rid of the Reshape(-1, 1) operation between hlslib nodes
model = model.transform(RemoveCNVtoFCFlatten())
# Get rid of Transpose -> Transpose identity sequences
model = model.transform(absorb.AbsorbConsecutiveTransposes())

# Infer tensor data layouts
model = model.transform(InferDataLayouts())
model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold())
model = model.transform(to_hls.InferChannelwiseLinearLayer())
# LabelSelect inference is run a second time here, as in the partition-0 flow.
model = model.transform(to_hls.InferLabelSelectLayer())
model = model.transform(GiveUniqueNodeNames())

In [None]:
# Apply CreateDataflowPartition and save the resulting parent model.
parent_model = model.transform(CreateDataflowPartition())
parent_model.save("part1_parent.onnx")


# Take the first StreamingDataflowPartition node of the parent graph and read
# the path of the child model from its "model" attribute. Indexing with [0]
# raises IndexError if no such node was created.
sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
sdp_node = getCustomOp(sdp_node)
dataflow_model_filename = sdp_node.get_nodeattr("model")
# save the dataflow partition with a different name for easier access
dataflow_model = ModelWrapper(dataflow_model_filename)
dataflow_model.save("part1_dataflow_model.onnx")

In [None]:
# Set the folding attributes on the partition-1 dataflow model by hand and
# save the result as "part1_dataflow_folded.onnx".
model = ModelWrapper("part1_dataflow_model.onnx")
fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation")
# each tuple is (PE, SIMD, ramstyle) for a layer (no FIFO depth is set here)
folding = [
    (10, 10, "auto"),
    (10, 10, "auto"),
]
# zip pairs layers with folding entries in order and stops at the shorter list.
for fcl, (pe, simd, ramstyle) in zip(fc_layers, folding):
    fcl_inst = getCustomOp(fcl)
    fcl_inst.set_nodeattr("PE", pe)
    fcl_inst.set_nodeattr("SIMD", simd)
    fcl_inst.set_nodeattr("ram_style", ramstyle)

# use same SIMD values for the sliding window operators
swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")
for i, swg in enumerate(swg_layers):
    # Read SIMD and ram_style from this generator's own folding entry rather
    # than reusing whatever `ramstyle` was left over from the loop above (which
    # is undefined when there are no MatrixVectorActivation layers). As before,
    # an IndexError is raised if there are more generators than folding entries.
    _, simd, ramstyle = folding[i]
    swg_inst = getCustomOp(swg)
    swg_inst.set_nodeattr("SIMD", simd)
    swg_inst.set_nodeattr("ram_style", ramstyle)

model = model.transform(GiveUniqueNodeNames())
model.save("part1_dataflow_folded.onnx")

In [None]:
# print the names of the supported PYNQ boards
from finn.util.basic import pynq_part_map
print(pynq_part_map.keys())

In [None]:
# Target board and clock period in ns for the partition-1 build.
# NOTE(review): partition 0 was built for "ZCU104" but this cell targets
# "Pynq-Z1" -- confirm that the two parts are meant for different boards.
test_pynq_board = "Pynq-Z1"
target_clk_ns = 10

from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
# Reload the folded partition-1 model and run the ZynqBuild transformation for
# the chosen board and clock period.
model = ModelWrapper("part1_dataflow_folded.onnx")
model = model.transform(ZynqBuild(platform = test_pynq_board, period_ns = target_clk_ns))