## Load in RHEED training data

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import h5py
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import tensorflow as tf
from qkeras import *
import hls4ml

tf.random.set_seed(0)

2025-05-22 20:20:19.853969: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-22 20:20:19.926259: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-22 20:20:19.929019: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-05-22 20:20:19.929033: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudar





### Useful functions

In [2]:
def print_dict(d, indent=0):
    """Pretty-print a (possibly nested) dictionary, one key per line.

    Nested dictionaries are printed recursively, indented two spaces per
    nesting level. Leaf entries are printed as ``key:<padding>value`` where
    the padding is chosen so values line up in a common column.

    Args:
        d: Dictionary to print. Keys may be of any type; they are rendered
            with ``str``.
        indent: Current nesting depth (number of two-space indents).
    """
    align = 20  # nominal width of the key column, measured from the line start
    for key, value in d.items():
        # Render the key once so non-string keys (e.g. ints) are measured
        # the same way they are printed.
        label = str(key)
        print('  ' * indent + label, end='')
        if isinstance(value, dict):
            print()
            print_dict(value, indent + 1)
        else:
            # A non-positive padding width simply yields no padding.
            print(':' + ' ' * (align - len(label) - 2 * indent) + str(value))

In [3]:
def custom_weighted_mse_loss(I, J, n):
    """Weighted mean-squared error between ``I`` and ``J``.

    Each squared difference ``(I - J) ** 2`` is multiplied element-wise by
    the weight ``I ** n`` before the mean over all elements is taken.

    Args:
        I: First tensor; also used to derive the weights.
        J: Second tensor, compared against ``I``.
        n: Exponent applied to ``I`` to form the weights.

    Returns:
        Scalar tensor holding the mean of the weighted squared differences.
    """
    # NOTE(review): `n` has no default, so this cannot be called with only
    # two arguments -- bind `n` before handing it to an API that does so.
    return tf.reduce_mean(tf.pow(I, n) * tf.pow(I - J, 2))

## Load the Model

I ran into issues loading the saved QKeras model directly, so as a workaround the architecture is rebuilt in code below and the trained weights can then be loaded into it separately.

In [4]:
# Quantization settings for the QAT model architecture.
# NOTE(review): the fixed-point split is 2 integer + 6 fractional bits;
# `symmetric` and `keep_negative` presumably mirror the quantizer options of
# the same name -- confirm where they are consumed.
integer_bits, fraction_bits = 2, 6
symmetric, keep_negative = 0, 1

In [5]:
import tensorflow as tf
from qkeras import QConv2DBatchnorm, QActivation, QDense

def _quantized_conv_pool(x, filters, pool, weight_quantizer, relu_quantizer):
    """Apply one stage: QConv2DBatchnorm (3x3, valid) -> quantized ReLU -> MaxPool2D."""
    x = QConv2DBatchnorm(
        filters=filters, kernel_size=3, strides=1, padding='valid',
        kernel_quantizer=weight_quantizer,
        bias_quantizer=weight_quantizer,
        kernel_initializer='lecun_uniform',
        kernel_regularizer=tf.keras.regularizers.l1(0.0001),
        use_bias=True,
    )(x)
    x = QActivation(relu_quantizer)(x)
    return tf.keras.layers.MaxPool2D(pool_size=pool, strides=pool)(x)


def build_model(input_shape, total_bits, integer_bits):
    """Build the quantized CNN: three conv/pool stages followed by two dense layers.

    Args:
        input_shape: Shape of a single input sample (without the batch axis).
        total_bits: Total bit width passed to every quantizer.
        integer_bits: Integer bit count passed to every quantizer.

    Returns:
        An uncompiled ``tf.keras.Model`` with a 5-unit output layer.
    """
    # All kernels/biases share one quantizer spec; all activations share another.
    weight_quantizer = f"quantized_bits({total_bits}, {integer_bits}, alpha=1)"
    relu_quantizer = f"quantized_relu({total_bits}, {integer_bits})"

    inputs = tf.keras.Input(shape=input_shape)

    # (filters, pool size) per stage. Layers are created in the same order as
    # a hand-unrolled version, so auto-generated layer names are unchanged.
    features = inputs
    for filters, pool in ((6, 4), (16, 2), (4, 2)):
        features = _quantized_conv_pool(
            features, filters, pool, weight_quantizer, relu_quantizer
        )

    features = tf.keras.layers.Flatten()(features)

    hidden = QDense(
        units=52,
        kernel_quantizer=weight_quantizer,
        bias_quantizer=weight_quantizer,
    )(features)
    hidden = QActivation(relu_quantizer)(hidden)

    # Output layer has no activation.
    outputs = QDense(
        units=5,
        kernel_quantizer=weight_quantizer,
        bias_quantizer=weight_quantizer,
    )(hidden)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

In [6]:
# Network hyper-parameters: 48x48 single-channel input; quantizers use
# 8 total bits, 2 of them integer bits.
input_shape = (48, 48, 1)
total_bits = 8
integer_bits = 2

# Instantiate the quantized CNN.
model = build_model(input_shape, total_bits, integer_bits)

# NOTE(review): Keras invokes a loss as loss(y_true, y_pred), while
# custom_weighted_mse_loss also requires `n`. This is harmless as long as the
# model is only used for inference/conversion, but fit()/evaluate() would
# fail -- bind `n` first if training is ever run from this notebook.
model.compile(
    optimizer='adam',
    loss=custom_weighted_mse_loss,
    run_eagerly=True,
)

# Show the layer-by-layer summary.
model.summary()

2025-05-22 20:20:22.077923: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-05-22 20:20:22.078061: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-05-22 20:20:22.078126: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2025-05-22 20:20:22.078165: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2025-05-22 20:20:22.078201: W tensorf

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 48, 48, 1)]       0         
                                                                 
 q_conv2d_batchnorm (QConv2D  (None, 46, 46, 6)        85        
 Batchnorm)                                                      
                                                                 
 q_activation (QActivation)  (None, 46, 46, 6)         0         
                                                                 
 max_pooling2d (MaxPooling2D  (None, 11, 11, 6)        0         
 )                                                               
                                                          

In [7]:
# Push one random sample through the model (the author's stated intent: call
# the model once to initialize the weights).
dummy_input = tf.random.normal(shape=(1, 48, 48, 1))
model.predict(dummy_input)

# Weight loading is currently disabled, so the model keeps its freshly
# initialized weights from here on.
# model.load_weights('/home/mattwilk/8bit6fractional/ml4fg/Gaussian_Model_QAT_2I_6F_weights.h5')  # Load Weights



array([[-2.296875 , -0.5566406,  0.5600586, -1.7104492,  1.9775391]],
      dtype=float32)

## HLS4ML Conversion

In [8]:
# Fixed-point format used everywhere in the HLS model: ap_fixed<FP_TOTAL, FP_INT>.
FP_TOTAL = 8
FP_INT = 2

# hls4ml conversion targets.
io_type = 'io_stream'
backend = 'Vitis'
part = 'xcku035-fbva676-2-e'
OUT_DIR = 'rtl_models/model_dummy_2'

layer_precision = f"ap_fixed<{FP_TOTAL},{FP_INT}>"

# Start from the per-layer ('name' granularity) config derived from the model.
config = hls4ml.utils.config_from_keras_model(model,
                                              default_precision=layer_precision,
                                              granularity='name')
config['Model']['Strategy'] = 'Resource'

config["LayerName"]["input_1"]["Precision"] = layer_precision

# Per-layer reuse factors. Every listed layer also gets the 'Resource'
# strategy and the common fixed-point precision.
# NOTE(review): the previous version configured "q_dense" twice
# (ReuseFactor 288, immediately overwritten by 98) and never configured
# "q_conv2d_batchnorm_2". The effective settings are kept here unchanged;
# the discarded 288 was presumably meant for "q_conv2d_batchnorm_2" --
# confirm and add it to this table if so.
LAYER_REUSE_FACTORS = {
    "q_conv2d_batchnorm": 50,
    "q_conv2d_batchnorm_1": 150,
    "q_dense": 98,
    "q_dense_1": 98,
}
for layer_name, reuse_factor in LAYER_REUSE_FACTORS.items():
    layer_cfg = config["LayerName"][layer_name]
    layer_cfg["ReuseFactor"] = reuse_factor
    layer_cfg["Strategy"] = "Resource"
    layer_cfg["Precision"] = layer_precision

print_dict(config)

Interpreting Model
Topology:
Layer name: input_1, layer type: InputLayer, input shapes: [[None, 48, 48, 1]], output shape: [None, 48, 48, 1]
Layer name: q_conv2d_batchnorm, layer type: QConv2DBatchnorm, input shapes: [[None, 48, 48, 1]], output shape: [None, 46, 46, 6]
Layer name: q_activation, layer type: Activation, input shapes: [[None, 46, 46, 6]], output shape: [None, 46, 46, 6]
Layer name: max_pooling2d, layer type: MaxPooling2D, input shapes: [[None, 46, 46, 6]], output shape: [None, 11, 11, 6]
Layer name: q_conv2d_batchnorm_1, layer type: QConv2DBatchnorm, input shapes: [[None, 11, 11, 6]], output shape: [None, 9, 9, 16]
Layer name: q_activation_1, layer type: Activation, input shapes: [[None, 9, 9, 16]], output shape: [None, 9, 9, 16]
Layer name: max_pooling2d_1, layer type: MaxPooling2D, input shapes: [[None, 9, 9, 16]], output shape: [None, 4, 4, 16]
Layer name: q_conv2d_batchnorm_2, layer type: QConv2DBatchnorm, input shapes: [[None, 4, 4, 16]], output shape: [None, 2, 2, 4

Layer name: q_dense_1, layer type: QDense, input shapes: [[None, 52]], output shape: [None, 5]
Model
  Precision
    default:         ap_fixed<8,2>
  ReuseFactor:       1
  Strategy:          Resource
  BramFactor:        1000000000
  TraceOutput:       False
LayerName
  input_1
    Trace:           False
    Precision:       ap_fixed<8,2>
  q_conv2d_batchnorm
    Trace:           False
    Precision:       ap_fixed<8,2>
    ReuseFactor:     50
    Strategy:        Resource
  q_conv2d_batchnorm_linear
    Trace:           False
    Precision
      result:        auto
  q_activation
    Trace:           False
    Precision
      result:        ufixed<8,2,RND_CONV,SAT,0>
  max_pooling2d
    Trace:           False
    Precision
      result:        auto
  q_conv2d_batchnorm_1
    Trace:           False
    Precision:       ap_fixed<8,2>
    ReuseFactor:     150
    Strategy:        Resource
  q_conv2d_batchnorm_1_linear
    Trace:           False
    Precision
      result:        auto
  

In [9]:
# Convert the Keras/QKeras model to an hls4ml model using the per-layer
# config and the target settings chosen in the configuration cell.
conversion_options = dict(
    hls_config=config,
    io_type=io_type,
    backend=backend,
    output_dir=OUT_DIR,
    part=part,
)
hls_model = hls4ml.converters.convert_from_keras_model(model, **conversion_options)

# Compile so that hls_model.predict() can be used below.
hls_model.compile()

Interpreting Model
Topology:
Layer name: input_1, layer type: InputLayer, input shapes: [[None, 48, 48, 1]], output shape: [None, 48, 48, 1]
Layer name: q_conv2d_batchnorm, layer type: QConv2DBatchnorm, input shapes: [[None, 48, 48, 1]], output shape: [None, 46, 46, 6]
Layer name: q_activation, layer type: Activation, input shapes: [[None, 46, 46, 6]], output shape: [None, 46, 46, 6]
Layer name: max_pooling2d, layer type: MaxPooling2D, input shapes: [[None, 46, 46, 6]], output shape: [None, 11, 11, 6]
Layer name: q_conv2d_batchnorm_1, layer type: QConv2DBatchnorm, input shapes: [[None, 11, 11, 6]], output shape: [None, 9, 9, 16]
Layer name: q_activation_1, layer type: Activation, input shapes: [[None, 9, 9, 16]], output shape: [None, 9, 9, 16]
Layer name: max_pooling2d_1, layer type: MaxPooling2D, input shapes: [[None, 9, 9, 16]], output shape: [None, 4, 4, 16]
Layer name: q_conv2d_batchnorm_2, layer type: QConv2DBatchnorm, input shapes: [[None, 4, 4, 16]], output shape: [None, 2, 2, 4

In [10]:
# Smoke-test the compiled HLS model on a constant float input filled with 7.
# NOTE(review): 7 presumably lies outside the range of ap_fixed<8,2>, so this
# checks that inference runs rather than numerical accuracy -- confirm.
hls_model.predict(np.full((1, 48, 48), 7, dtype=float))

array([-0.890625, -0.09375 , -0.015625, -0.59375 ,  0.578125])

In [13]:
# Run the HLS build with C simulation skipped (csim=False), C synthesis
# enabled (synth=True) and Vivado synthesis enabled (vsynth=True). The
# returned report dictionary is displayed as the cell output.
# NOTE(review): this cell's execution count skips ahead of the previous one;
# re-run the notebook top to bottom to confirm it reproduces from a fresh kernel.
hls_model.build(csim=False, synth=True, vsynth=True)


****** Vitis HLS - High-Level Synthesis from C, C++ and OpenCL v2022.2 (64-bit)
  **** SW Build 3670227 on Oct 13 2022
  **** IP Build 3669848 on Fri Oct 14 08:30:02 MDT 2022
    ** Copyright 1986-2022 Xilinx, Inc. All Rights Reserved.

source /tools/Xilinx/Vitis_HLS/2022.2/scripts/vitis_hls/hls.tcl -notrace
INFO: [HLS 200-10] Running '/tools/Xilinx/Vitis_HLS/2022.2/bin/unwrapped/lnx64.o/vitis_hls'
INFO: [HLS 200-10] For user 'aelabd' on host 'DESKTOP-Q0UCNGC.' (Linux_x86_64 version 5.15.133.1-microsoft-standard-WSL2) on Thu May 22 20:20:38 CEST 2025
INFO: [HLS 200-10] On os Ubuntu 24.04 LTS
INFO: [HLS 200-10] In directory '/home/aelabd/RHEED/CoaxlinkQuadCxp12_1cam/rtl_models/model_dummy_2'
Sourcing Tcl script 'build_prj.tcl'
INFO: [HLS 200-1510] Running: open_project myproject_prj 
INFO: [HLS 200-10] Opening project '/home/aelabd/RHEED/CoaxlinkQuadCxp12_1cam/rtl_models/model_dummy_2/myproject_prj'.
INFO: [HLS 200-1510] Running: set_top myproject 
INFO: [HLS 200-1510] Running: add_fil

{'CSynthesisReport': {'TargetClockPeriod': '5.00',
  'EstimatedClockPeriod': '4.876',
  'BestLatency': '140514',
  'WorstLatency': '140609',
  'IntervalMin': '9218',
  'IntervalMax': '140546',
  'BRAM_18K': '56',
  'DSP': '0',
  'FF': '11975',
  'LUT': '33215',
  'URAM': '0',
  'AvailableBRAM_18K': '1080',
  'AvailableDSP': '1700',
  'AvailableFF': '406256',
  'AvailableLUT': '203128',
  'AvailableURAM': '0'},
 'VivadoSynthReport': {'LUT': '12655',
  'FF': '11631',
  'BRAM_18K': '10',
  'DSP48E': '0'}}

In [14]:
# Print the synthesis reports found in the hls4ml project under the model's
# configured output directory.
hls4ml.report.read_vivado_report(hls_model.config.config['OutputDir'])

Found 1 solution(s) in rtl_models/model_dummy_2/myproject_prj.
Reports for solution "solution1":

C simulation report not found.
SYNTHESIS REPORT:
== Vitis HLS Report for 'myproject'
* Date:           Thu May 22 20:21:58 2025

* Version:        2022.2 (Build 3670227 on Oct 13 2022)
* Project:        myproject_prj
* Solution:       solution1 (Vivado IP Flow Target)
* Product family: kintexu
* Target device:  xcku035-fbva676-2-e


== Performance Estimates
+ Timing: 
    * Summary: 
    +--------+---------+----------+------------+
    |  Clock |  Target | Estimated| Uncertainty|
    +--------+---------+----------+------------+
    |ap_clk  |  5.00 ns|  4.876 ns|     1.35 ns|
    +--------+---------+----------+------------+

+ Latency: 
    * Summary: 
    +---------+---------+----------+----------+------+--------+----------+
    |  Latency (cycles) |  Latency (absolute) |    Interval   | Pipeline |
    |   min   |   max   |    min   |    max   |  min |   max  |   Type   |
    +---------+-