max7800x.py
###################################################################################################
# Copyright (C) 2019-2023 Maxim Integrated Products, Inc. All Rights Reserved.
#
# Maxim Integrated Products, Inc. Default Copyright Notice:
# https://www.maximintegrated.com/en/aboutus/legal/copyrights.html
###################################################################################################
"""
Backend for MAX7800X embedded code generation and RTL simulations
"""
import copy
import hashlib
import os
import sys
from typing import List, Tuple
import numpy as np
from izer import (apbaccess, assets, compute, console, datamem, kbias, kdedup, kernels, latency,
load, op, rtlsim, state, stats)
from izer import tornadocnn as tc
from izer.eprint import eprint, nprint, wprint
from izer.names import layer_pfx, layer_str
from izer.simulate import (conv1d_layer, conv2d_layer, convtranspose2d_layer, eltwise_layer,
passthrough_layer, pooling_layer, print_data, show_data)
from izer.utils import ffs, fls, overlap, plural, popcount
from . import backend
class Backend(backend.Backend):
"""
Backend for MAX7800X CNN network code generation
"""
def create_net(self) -> str: # pylint: disable=too-many-locals,too-many-branches
"""
Chain multiple CNN layers, create and save input and output
"""
# Cache variables locally for faster access
activation = state.activation
allow_streaming = state.allow_streaming
apb_base = state.apb_base
api_filename = state.api_filename
avg_pool_rounding = state.avg_pool_rounding
base_directory = state.base_directory
bias = state.bias
bias_group_map = state.bias_group_map
big_data = state.big_data
block_mode = state.block_mode
board_name = state.board_name
buffer_insert = state.buffer_insert
buffer_shift = state.buffer_shift
bypass = state.bypass
c_filename = state.c_filename
calcx4 = state.calcx4
compact_data = state.compact_data
conv_groups = state.conv_groups
data = state.data
data_buffer = state.data_buffer
data_buffer_cfg = state.data_buffer_cfg
debug_new_streaming = state.debug_new_streaming
debug_snoop = state.debug_snoop
dilation = state.dilation
eltwise = state.eltwise
embedded_code = state.embedded_code
ext_rdy = state.ext_rdy
avgpool_reset_layer = state.avgpool_reset_layer
fast_fifo = state.fast_fifo
fast_fifo_quad = state.fast_fifo_quad
fifo = state.fifo
final_layer = state.final_layer
first_layer_used = state.first_layer_used
flatten = state.flatten
forever = state.forever
ignore_bias_groups = state.ignore_bias_groups
in_offset = state.in_offset
in_sequences = state.in_sequences
increase_delta1 = state.increase_delta1
increase_delta2 = state.increase_delta2
increase_start = state.increase_start
init_tram = state.init_tram
input_chan = state.input_channels
input_channel_skip = state.input_channel_skip
input_csv = state.input_csv
input_dim = state.input_dim
input_crop = state.input_crop
input_skip = state.input_skip
kernel = state.weights
kernel_size = state.kernel_size
layers = state.layers
legacy_test = state.legacy_test
link_layer = state.link_layer
log = state.log
log_filename = state.log_filename
log_intermediate = state.log_intermediate
log_pooling = state.log_pooling
measure_energy = state.measure_energy
next_sequence = state.next_sequence
no_error_stop = state.no_error_stop
oneshot = state.oneshot
operands = state.operands
operator = state.operator
out_offset = state.out_offset
output_chan = state.output_channels
output_dim = state.output_dim
output_size = list(zip(output_chan, (output_dim[x][0] for x in range(len(output_dim))),
(output_dim[x][1] for x in range(len(output_dim)))))
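# output_size collects one (output channels, height, width) tuple per layer.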
output_filename = state.output_filename
output_layer = state.output_layer
output_padding = state.output_padding
output_processor_map = state.output_processor_map
output_shift = state.output_shift
output_width = state.output_width
override_delta1 = state.override_delta1
override_delta2 = state.override_delta2
override_rollover = state.override_rollover
override_start = state.override_start
overwrite = state.overwrite
overwrite_ok = state.overwrite_ok
padding = state.padding
pool = state.pool
pool_average = state.pool_average
pool_dilation = state.pool_dilation
pool_first = state.pool_first
pool_stride = state.pool_stride
pooled_dim = state.pooled_dim
powerdown = state.powerdown
prefix = state.prefix
pretend_zero_sram = state.pretend_zero_sram
prev_sequence = state.prev_sequence
processor_map = state.processor_map
quantization = state.quantization
rd_ahead = state.read_ahead
repeat_layers = state.repeat_layers
reshape_inputs = state.reshape_inputs
riscv = state.riscv
riscv_cache = state.riscv_cache
riscv_flash = state.riscv_flash
simple1b = state.simple1b
simulated_sequence = state.simulated_sequence
snoop = state.snoop
snoop_sequence = state.snoop_sequence
start_layer = state.start_layer
stopstart = state.stopstart
streaming = state.streaming
stride = state.stride
tcalc = state.tcalc
test_bist = state.test_bist
timeout = state.timeout
timer = state.timer
verbose = state.verbose
verify_kernels = state.verify_kernels
verify_writes = state.verify_writes
weight_filename = state.weight_filename
write_gap = state.write_gap
write_zero_regs = state.write_zero_regs
zero_sram = state.zero_sram
zero_unused = state.zero_unused
if not os.path.isdir('assets'):
eprint('The assets folder is missing from the current directory.')
assert tc.dev is not None
device = tc.dev.device
in_expand = [0] * layers
in_expand_invol = [0] * layers
out_expand = [0] * layers
in_expand_thresh = [0] * layers
out_expand_thresh = [0] * layers
tram_max = [0] * layers
timeslots = [1] * layers
hw_padding = padding.copy()
input_dim_str = [None] * layers
output_dim_str = [None] * layers
kernel_size_str = [None] * layers
pool_str = [None] * layers
padding_str = [None] * layers
pool_stride_str = [None] * layers
pool_dilation_str = [None] * layers
dilation_str = [None] * layers
stride_str = [None] * layers
stream_buf = [None] * layers
out_ignore = [0] * layers
out_pad = [0] * layers
hw_add_layers = [0] * layers
hw_flatten = [False] * layers
flatten_prod = [0] * layers
sum_hw_layers = 0
rollover = [None] * layers
all_outputs_map = None
terminating_layer = final_layer
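# A -1 entry in simulated_sequence ends the simulated network early; in that case the
# terminating layer becomes the position of that entry rather than final_layer.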
for i, s in enumerate(simulated_sequence):
if s == -1:
terminating_layer = i
break
if zero_sram:
state.rtl_preload = False
if start_layer > 0 and not tc.dev.SUPPORT_LINK_LAYER:
eprint('`--start-layer` is not supported on this device.')
if start_layer > tc.dev.MAX_START_LAYER:
eprint(f'`--start-layer` is set to {start_layer}, but the device only supports '
f'a maximum of {tc.dev.MAX_START_LAYER}.')
if link_layer and not tc.dev.SUPPORT_LINK_LAYER:
eprint('`--link-layer` is not supported on this device.')
if any(rd_ahead) and not tc.dev.SUPPORT_READ_AHEAD:
eprint('`readahead` is not supported on this device.')
if any(calcx4) and not tc.dev.SUPPORT_CALCX4:
eprint('`calcx4` is not supported on this device.')
if state.pipeline and not tc.dev.SUPPORT_PIPELINE:
eprint('`--pipeline` is not supported on this device.')
if state.pll and not tc.dev.SUPPORT_PLL:
eprint('`--pll` is not supported on this device.')
if state.fifo_go and not tc.dev.SUPPORT_FIFO_GO:
eprint('`--fifo-go` is not supported on this device.')
if snoop is not None and not tc.dev.SUPPORT_SNOOP:
eprint('`snoop` is not supported on this device.')
if oneshot and not tc.dev.SUPPORT_ONESHOT:
eprint('`--one-shot` is not supported on this device.')
if state.pipeline is None: # Turn the pipeline on by default
state.pipeline = tc.dev.SUPPORT_PIPELINE
pipeline = state.pipeline # Cache
if state.pll is None: # Turn the PLL on by default
state.pll = tc.dev.SUPPORT_PLL
if not state.balance_power and not state.pll:
eprint('`--max-speed` requires `--pll` or `--pipeline`.')
clock_speed = tc.dev.PLL_SPEED if state.pll else tc.dev.APB_SPEED
if state.clock_divider is None:
if pipeline:
state.clock_divider = 1
else:
# Pick smallest working clock divider
cdiv = (clock_speed + tc.dev.MAX_NO_PIPELINE_SPEED - 1) \
// tc.dev.MAX_NO_PIPELINE_SPEED
# Round up to the next power of 2
cdiv -= 1
cdiv |= cdiv >> 1
cdiv |= cdiv >> 2
cdiv |= cdiv >> 4
cdiv |= cdiv >> 8
cdiv += 1
state.clock_divider = cdiv
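# For reference, the round-up above maps a computed divider of 3 to 4 and 5 to 8,
# while exact powers of two (1, 2, 4, 8, ...) are left unchanged.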
if clock_speed // state.clock_divider > tc.dev.MAX_NO_PIPELINE_SPEED and not pipeline:
wprint(f'For a CNN clock speed of {clock_speed} MHz, the pipeline must be enabled.')
elif clock_speed // state.clock_divider <= tc.dev.MAX_NO_PIPELINE_SPEED and pipeline:
nprint(f'For a CNN clock speed of {clock_speed} MHz, the pipeline can be disabled.')
if state.clock_divider > tc.dev.MAX_CNNCLKDIV:
nprint(f'The clock divider of {state.clock_divider} exceeds the device maximum '
f'({tc.dev.MAX_CNNCLKDIV}).')
if test_bist and (zero_sram or pretend_zero_sram):
# Clear every seventh kernel so we can test the BIST
for i, _ in enumerate(kernel):
if kernel[i] is not None:
kernel[i][::7] = \
np.full(shape=kernel[i][0].shape, fill_value=0, dtype=np.int64)
if state.result_output and (state.mlator or oneshot or stopstart):
state.result_output = False
result_output = state.result_output # Cache
if result_output:
state.max_count = None
if (state.rtl_preload or state.rtl_preload_weights or state.result_output) \
and not tc.dev.SUPPORT_SIM_PRELOAD:
eprint('`--rtl-preload` and `--result-output` are not supported on this device.')
if embedded_code and any(calcx4) and not state.new_kernel_loader:
wprint('Enabling --new-kernel-loader since calcx4 is used.')
state.new_kernel_loader = True
state.compact_weights = False
if not state.new_kernel_loader and state.mexpress:
if any(calcx4):
wprint('Ignoring --mexpress since calcx4 is used.')
state.mexpress = False
else:
state.compact_weights = True
mexpress = state.mexpress
compact_weights = state.compact_weights
# Check streaming and FIFO constraints
fifo_group = fast_fifo
if not fifo and state.synthesize_input is not None:
eprint('`--synthesize-input` requires `--fifo`')
if big_data[start_layer] and state.synthesize_input is not None:
eprint('`--synthesize-input` requires `data_format: HWC`')
if fifo:
if start_layer != 0:
eprint('`--start_layer` must be 0 when using a FIFO.')
if input_chan[start_layer] > 16 \
or big_data[start_layer] and input_chan[start_layer] > 4:
eprint('Using the FIFO is restricted to a maximum of 4 input channels (CHW) or '
f'16 channels (HWC); this input is using {input_chan[start_layer]} '
'channels.')
if big_data[start_layer] and processor_map[start_layer] & ~0x0001000100010001 != 0 \
or not big_data[start_layer] \
and processor_map[start_layer] & ~0x000f000f000f000f != 0:
eprint('The FIFO is restricted to processors 0, 16, 32, 48 (CHW) or '
'0-3, 16-19, 32-35, 48-51 (HWC).')
if fast_fifo:
if big_data[start_layer] and input_chan[start_layer] > 1:
eprint('Fast FIFO supports only a single CHW input channel; '
f'this test is using {input_chan[start_layer]} channels.')
elif not big_data[start_layer] and input_chan[start_layer] > 4:
eprint('Fast FIFO supports up to four HWC input channels; '
f'this test is using {input_chan[start_layer]} channels.')
if processor_map[start_layer] != 1 and processor_map[start_layer] & 0x0e == 0:
fifo_group = False
if output_width[start_layer] != 8:
eprint('Single-layer fast FIFO setup requires output width of 8.')
if operator[start_layer] == op.NONE:
eprint('Fast FIFO requires a convolution operation in the first layer.')
elif streaming[start_layer] and not allow_streaming:
eprint('Streaming in the first layer requires use of a FIFO.')
if any(streaming) and start_layer != 0:
eprint('`--start_layer` must be 0 when using streaming.')
for ll in range(min(tc.dev.MAX_STREAM_LAYERS + 1, layers)):
if next_sequence[ll] != -1 and next_sequence[ll] != ll + 1 and streaming[ll]:
eprint(f'{layer_pfx(ll)}`next_sequence` must be {layer_str(ll+1)} when '
f'using streaming. Currently configured: {layer_str(next_sequence[ll])}')
if tc.dev.EMULATE_1X1_STREAMING and kernel_size[ll] == [1, 1] \
and operator[ll] in [op.CONV2D, op.CONVTRANSPOSE2D] \
and (streaming[ll] or prev_sequence[ll] >= 0 and streaming[prev_sequence[ll]]):
nprint(f'{layer_pfx(ll)}Using 3x3 kernel hardware for layer with 1x1 kernel due '
'to streaming.')
# Create 3x3 weights from 1x1 weights and emulate using 3x3 kernels
weight33 = np.zeros((kernel[ll].shape[0], 3, 3), dtype=np.int64)
weight33[:, 1, 1] = kernel[ll][:, 0, 0]
kernel[ll] = weight33
assert padding[ll] == [0, 0]
padding[ll] = [1, 1]
hw_padding[ll] = [1, 1]
kernel_size[ll][0] = kernel_size[ll][1] = 3
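# Placing the single 1x1 weight at the center of an otherwise zero 3x3 kernel, together
# with raising the padding from 0 to 1, produces the same output values as the original
# 1x1 convolution -- the zero taps contribute nothing to any output pixel.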
if not tc.dev.SUPPORT_STREAM_NONPAD_FINAL and streaming[ll] \
and (next_sequence[ll] == -1 or not streaming[next_sequence[ll]]) \
and (padding[ll][0] == 0 or padding[ll][1] == 0):
eprint(f'{layer_pfx(ll)}Padding for the final streaming layer must not '
'be zero.')
if not tc.dev.SUPPORT_STREAMING_PASSTHROUGH \
and operator[ll] == op.NONE and streaming[ll]:
eprint(f'{layer_pfx(ll)}Passthrough operations are not supported for streaming '
'layers.')
mlator = state.mlator
if state.softmax and output_width[terminating_layer] == 8:
wprint('--softmax should only be used with `output_width: 32`.')
if fast_fifo and not riscv:
eprint('--fast-fifo requires --riscv')
if state.sleep and not riscv:
eprint('--deepsleep requires --riscv')
if oneshot and timer is not None:
eprint('--timer is not supported when using --one-shot')
if not tc.dev.SUPPORT_KERNEL_BYPASS \
and any(bypass[ll] for ll in range(first_layer_used, layers)):
eprint('Kernel bypass is not supported on this device.')
processor_map_0 = processor_map[start_layer]
if fast_fifo_quad:
processor_map[start_layer] = processor_map_0 << 48 | processor_map_0 << 32 \
| processor_map_0 << 16 | processor_map_0
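# The quad fast FIFO feeds all four 16-processor groups, so the first-layer processor
# selection is replicated into each group's 16-bit slice of the processor map.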
for i, e in enumerate(quantization):
if e is None:
quantization[i] = 0 # Only in unused layers
binary_quantization = any(quantization[ll] == -1 for ll in range(first_layer_used, layers))
# Check we're not using binary weights on devices that don't support it
if binary_quantization and not tc.dev.SUPPORT_BINARY_WEIGHTS:
eprint('Binary weights (-1/+1) are not supported on this device.')
# Account for extra transparently inserted hardware layers
for ll in range(0, layers):
if avgpool_reset_layer[ll]:
sum_hw_layers += 1
hw_add_layers[ll] = sum_hw_layers
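# hw_add_layers[ll] is the running total of transparently inserted hardware layers up to
# and including layer ll.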
if repeat_layers * (final_layer + sum_hw_layers) > tc.dev.MAX_LAYERS:
rep = '' if repeat_layers == 1 else f'When repeating {repeat_layers} times, '
eprint(f'{rep}The adjusted layer count ({final_layer + sum_hw_layers}) '
f'exceeds the device maximum ({tc.dev.MAX_LAYERS}).')
hw_operator = operator.copy()
hw_input_dim = copy.deepcopy(input_dim)
hw_pooled_dim = copy.deepcopy(pooled_dim)
hw_output_dim = copy.deepcopy(output_dim)
hw_kernel_size = copy.deepcopy(kernel_size)
hw_kernel = copy.deepcopy(kernel)
hw_dilation = copy.deepcopy(dilation)
# Check that input channels are in separate memory instances if CHW (big) data format is
# used, and calculate input and output expansion
for ll in range(first_layer_used, layers):
if quantization[ll] == 1 and binary_quantization:
eprint(f'{layer_pfx(ll)}Cannot combine binary quantization with '
'1-bit quantization.')
if output_shift[ll] is None:
output_shift[ll] = 0 if not bypass[ll] else 7 # Set default
if output_shift[ll] < -15 or output_shift[ll] > 15:
implicit_shift = 8 - abs(quantization[ll]) if not bypass[ll] else 0
eprint(f'{layer_pfx(ll)}{abs(quantization[ll])}-bit weight '
f'quantization supports an output_shift range of [{-15 - implicit_shift}, '
f'+{15 - implicit_shift}]. The specified value of output_shift is '
f'{output_shift[ll] - implicit_shift} which exceeds the system limits.')
if big_data[ll]:
p = processor_map[ll] >> (ffs(processor_map[ll]) & ~(tc.dev.P_SHARED-1))
while p:
if popcount(p & (tc.dev.P_SHARED-1)) > 1:
eprint(f'{layer_pfx(ll)}CHW input format, but multiple '
'channels share the same memory instance. Modify the processor '
'map.')
p >>= tc.dev.P_SHARED
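# Multi-pass ("expansion") setup: when a layer uses more channels than there are
# processors (tc.dev.MAX_PROC), the channels are handled in several passes. *_expand is
# the pass count and *_expand_thresh the channels handled per pass, rounded up to a
# multiple of tc.dev.P_SHARED when the channel count exceeds the processor count.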
out_expand[ll] = (output_chan[ll] + tc.dev.MAX_PROC-1) // tc.dev.MAX_PROC
out_expand_thresh[ll] = (output_chan[ll] + out_expand[ll]-1) // out_expand[ll]
if output_chan[ll] > tc.dev.MAX_PROC:
out_expand_thresh[ll] = \
min((out_expand_thresh[ll] + tc.dev.P_SHARED-1) & ~(tc.dev.P_SHARED-1),
tc.dev.MAX_PROC)
in_expand[ll] = (input_chan[ll] + tc.dev.MAX_PROC-1) // tc.dev.MAX_PROC
if tcalc[ll] is None:
tcalc[ll] = rd_ahead[ll] and in_expand[ll] > 1 # Set default
in_expand_invol[ll] = (in_expand[ll] + 3) & ~3 if tcalc[ll] else in_expand[ll]
in_expand_thresh[ll] = (input_chan[ll] + in_expand[ll] - 1) // in_expand[ll]
if input_chan[ll] > tc.dev.MAX_PROC:
in_expand_thresh[ll] = \
min((in_expand_thresh[ll] + tc.dev.P_SHARED-1) & ~(tc.dev.P_SHARED-1),
tc.dev.MAX_PROC)
assert input_dim[ll][0] * input_dim[ll][1] * in_expand[ll] < tc.dev.FRAME_SIZE_MAX
# Data memory size check - 4 channels share one instance unless CHW format
in_size = input_dim[ll][0] * input_dim[ll][1] * in_expand[ll] * operands[ll] \
* (1 if big_data[ll] else 4)
if not streaming[ll] and in_size + in_offset[ll] > tc.dev.INSTANCE_WIDTH*16:
eprint(f'{layer_pfx(ll)}{1 if big_data[ll] else 4} '
f'channel{"s" if not big_data[ll] else ""}/word {input_dim[ll][0]}x'
f'{input_dim[ll][1]} input (size {in_size}) '
f'with input offset 0x{in_offset[ll]:04x} and expansion {in_expand[ll]}x '
f'exceeds data memory instance size of {tc.dev.INSTANCE_WIDTH*16}.')
if operator[ll] != op.CONV1D:
input_dim_str[ll] = f'{input_dim[ll][0]}x{input_dim[ll][1]}'
output_dim_str[ll] = f'{output_dim[ll][0]}x{output_dim[ll][1]}'
kernel_size_str[ll] = f'{kernel_size[ll][0]}x{kernel_size[ll][1]}'
pool_str[ll] = f'{pool[ll][0]}x{pool[ll][1]}' \
if pool[ll][0] > 1 or pool[ll][1] > 1 else '0x0'
padding_str[ll] = f'{padding[ll][0]}/{padding[ll][1]}'
pool_stride_str[ll] = f'{pool_stride[ll][0]}/{pool_stride[ll][1]}'
pool_dilation_str[ll] = f'{pool_dilation[ll][0]}/{pool_dilation[ll][1]}'
dilation_str[ll] = f'{dilation[ll][0]}/{dilation[ll][1]}'
stride_str[ll] = f'{stride[ll][0]}/{stride[ll][1]}'
else:
input_dim_str[ll] = f'{input_dim[ll][0]}'
output_dim_str[ll] = f'{output_dim[ll][0]}'
kernel_size_str[ll] = f'{kernel_size[ll][0]}'
pool_str[ll] = f'{pool[ll][0]}' \
if pool[ll][0] > 1 or pool[ll][1] > 1 else '0'
padding_str[ll] = f'{padding[ll][0]}'
pool_stride_str[ll] = f'{pool_stride[ll][0]}'
pool_dilation_str[ll] = f'{pool_dilation[ll][0]}'
dilation_str[ll] = f'{dilation[ll][0]}'
stride_str[ll] = f'{stride[ll][0]}'
if operands[ll] > 1:
eprint(f'{layer_pfx(ll)}Element-wise operations cannot be '
'combined with Conv1d.')
if not tc.dev.SUPPORT_MULTIPASS_PADDED_CONV1D and padding[ll][0] > 0 \
and in_expand[ll] > 1:
eprint(f'{layer_pfx(ll)}This device does not support padded Conv1d '
'with input expansion > 1.', error=not state.ignore_hw_limits)
if operator[ll] == op.NONE and next_sequence[ll] != -1 \
and operator[next_sequence[ll]] == op.NONE \
and not any(ll in c if c is not None else False for c in in_sequences):
nprint(f'{layer_pfx(ll)}Passthrough layer is followed by passthrough '
f'layer {layer_str(next_sequence[ll])}. '
'These layers could potentially be combined.')
if operands[ll] > 0 and pool[ll][0] == 1 and pool[ll][1] == 1 \
and (pool[next_sequence[ll]][0] > 1 or pool[next_sequence[ll]][1] > 1) \
and operands[next_sequence[ll]] == 1:
nprint('Use `pool_first: False` to combine element-wise and pooling layers '
'where pooling is executed after the element-wise operation.')
if not pool_first[ll] and operands[ll] > tc.dev.MAX_POOL_LAST_ELEMENTS \
and (pool[ll][0] > 1 or pool[ll][1] > 1):
eprint(f'"pool last" supports a maximum of {tc.dev.MAX_POOL_LAST_ELEMENTS} '
'element-wise operands on this device.')
if dilation[ll][0] > 1:
if operator[ll] != op.CONV1D:
eprint(f'{layer_pfx(ll)}`dilation` > 1 is supported for Conv1d only.')
if kernel_size[ll][0] == 1:
eprint(f'{layer_pfx(ll)}Kernel length must be greater than 1 to use '
'`dilation` > 1.')
if (kernel_size[ll][0] - 1) * dilation[ll][0] < 9:
# Stretch existing kernel if we can
# 0 1 2 --> 0 X X X 1 X X X 2
kzeros = []
for s in range(1, kernel_size[ll][0]):
kzeros += [s] * (dilation[ll][0] - 1)
k = np.insert(kernel[ll], kzeros, 0, axis=1)
hw_kernel[ll] = k
hw_kernel_size[ll] = [k.shape[1], 1]
elif kernel_size[ll][0] <= tc.dev.MAX_DILATION_1D_KERNEL:
# Use Conv2d
if pool[ll][0] != 1:
eprint(f'{layer_pfx(ll)}Pooling must be 1 to use `dilation` > 4.')
if padding[ll][0] > tc.dev.MAX_DILATION_1D_PAD:
eprint(f'{layer_pfx(ll)}Padding must be '
f'{tc.dev.MAX_DILATION_1D_PAD} or smaller to use `dilation` > 4.')
if operands[ll] != 1:
eprint(f'{layer_pfx(ll)}Operands must be 1 to use `dilation` > 4.')
if bypass[ll] or flatten[ll] or rd_ahead[ll] or streaming[ll]:
eprint(f'{layer_pfx(ll)}`bypass`, `flatten`, `rd_ahead`, '
'`streaming` must be False to use `dilation` > 4.')
if dilation[ll][0] > tc.dev.MAX_DILATION_1D:
eprint(f'{layer_pfx(ll)}`dilation` must be '
f'{tc.dev.MAX_DILATION_1D} or smaller for Conv1d operations.')
nprint(f'{layer_pfx(ll)}Using Conv2d hardware for dilated Conv1d.')
# Use the Conv2d hardware with 1 pad on 'dilation' columns using 3x3 kernels
hw_operator[ll] = op.CONV2D
hw_input_dim[ll][0] = (input_dim[ll][0] + dilation[ll][0] - 1) \
// dilation[ll][0]
hw_input_dim[ll][1] = dilation[ll][0]
hw_pooled_dim[ll] = hw_input_dim[ll]
hw_output_dim[ll] = hw_pooled_dim[ll]
hw_padding[ll] = [1, 1]
hw_kernel_size[ll] = [3, 3]
hw_dilation[ll] = [1, 1]
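# Folding the 1-D input into a (ceil(length / dilation) x dilation) 2-D map puts samples
# that are `dilation` apart in the sequence into vertically adjacent rows of the same
# column, so a 3x3 kernel whose weights sit in the center column performs the dilated
# 1-D convolution.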
# 2D output size is equal to the 2D input size since the pad is fixed to 1.
# Subtract the original output dimensions to calculate the overage.
out_pad[ll] = hw_input_dim[ll][0] * hw_input_dim[ll][1] - output_dim[ll][0]
# Create 3x3 kernel from 3x1 kernel -- move original into center column
k = np.insert(kernel[ll].reshape(output_chan[ll],
input_chan[ll] // conv_groups[ll],
kernel_size[ll][0], -1),
[0, 1], 0, axis=3)
if kernel_size[ll][0] == 2:
k = np.insert(k, 0, 0, axis=2) # Insert at top - throw away the padding
elif kernel_size[ll][0] == 1:
k = np.insert(k, [0, 1], 0, axis=2) # Use center
else: # 3
out_ignore[ll] = 4 * dilation[ll][0] * out_expand[ll]
assert k.shape[2] == k.shape[3] == 3
hw_kernel[ll] = k.reshape(-1, k.shape[2], k.shape[3])
if out_offset[ll] < out_ignore[ll]:
eprint(f'{layer_pfx(ll)}`out_offset` used with dilation of '
f'{dilation[ll][0]} must be at least 0x{out_ignore[ll]:04x}.')
else:
eprint(f'{layer_pfx(ll)}Kernel length must be '
f'{tc.dev.MAX_DILATION_1D_KERNEL} or smaller to use `dilation` of '
f'{dilation[ll][0]}.')
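# Output data memory requirement in bytes: each word holds four channels, so the pixel
# count (plus the padding overage for dilated Conv1d) is multiplied by the output pass
# count, by 4 channels per word, and by the output width in bytes.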
out_size = (output_dim[ll][0] * output_dim[ll][1] + out_pad[ll]) * out_expand[ll] \
* 4 * output_width[ll] // 8
if (not streaming[ll] or ll == terminating_layer) \
and out_size + out_offset[ll] > tc.dev.INSTANCE_WIDTH*16:
eprint(f'{layer_pfx(ll)}HWC (4 channels/word) '
f'{output_width[ll]}-bit {output_dim[ll][0]}x'
f'{output_dim[ll][1]} output (size {out_size}) '
f'with output offset 0x{out_offset[ll]:04x} and expansion '
f'{out_expand[ll]}x '
f'exceeds data memory instance size of {tc.dev.INSTANCE_WIDTH*16}.')
if hw_operator[ll] == op.NONE:
if activation[ll] is not None:
eprint(f'{layer_pfx(ll)}Pass-through layers must not use activation.')
if padding[ll][0] != 0 or padding[ll][1] != 0:
eprint(f'{layer_pfx(ll)}Padding must be zero for passthrough layers.')
if output_shift[ll] != 0 and output_shift[ll] is not None:
eprint(f'{layer_pfx(ll)}`output_shift` must be zero for passthrough '
'layers.')
if (pool[ll][0] > 1 or pool[ll][1] > 1) \
and in_expand[ll] > tc.dev.MAX_POOL_PASSES \
and (hw_pooled_dim[ll][0] > 1 or hw_pooled_dim[ll][1] > 1):
eprint(f'{layer_pfx(ll)}pooling in passthrough layer uses '
f'{in_expand[ll]} {plural(in_expand[ll], "pass", "es")}, '
f'which exceeds the maximum of {tc.dev.MAX_POOL_PASSES} '
'on this device.')
tram_max[ll] = 1
else:
if hw_operator[ll] == op.CONVTRANSPOSE2D:
# Flip padding around to match PyTorch conventions for ConvTranspose2d
hw_padding[ll] = (
hw_dilation[ll][0] * (hw_kernel_size[ll][0] - 1) - hw_padding[ll][0],
hw_dilation[ll][1] * (hw_kernel_size[ll][1] - 1) - hw_padding[ll][1]
)
if hw_padding[ll][0] not in tc.dev.SUPPORTED_X2D_PADS \
or hw_padding[ll][1] not in tc.dev.SUPPORTED_X2D_PADS:
eprint(f'{layer_pfx(ll)}The selected padding ({padding[ll]}) for '
'ConvTranspose2d is not supported on this device.')
if output_padding[ll][0] not in tc.dev.SUPPORTED_X2D_OUTPUT_PADS \
or output_padding[ll][1] not in tc.dev.SUPPORTED_X2D_OUTPUT_PADS:
eprint(f'{layer_pfx(ll)}The selected output padding '
f'({output_padding[ll]}) for ConvTranspose2d is not supported '
'on this device.')
tram_max[ll] = max(0, (hw_pooled_dim[ll][1] - 1) * stride[ll][1] + 1
+ output_padding[ll][1] + 2 * hw_padding[ll][1]
- hw_kernel_size[ll][1]) + 1
else:
tram_max[ll] = max(0, hw_pooled_dim[ll][1] + 2 * hw_padding[ll][1]
- hw_kernel_size[ll][1]) + 1
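# In both branches above, tram_max works out to the number of horizontal output positions
# produced per row, which determines the layer's TRAM requirement.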
if hw_operator[ll] != op.CONVTRANSPOSE2D and (output_padding[ll][0] != 0
or output_padding[ll][1] != 0):
eprint(f'{layer_pfx(ll)}Output padding must be 0 for this operator.')
if input_chan[ll] % conv_groups[ll] != 0 or output_chan[ll] % conv_groups[ll] != 0:
eprint(f'{layer_pfx(ll)}The number of convolution groups ({conv_groups[ll]}) does not '
f'divide the input channels ({input_chan[ll]}) or the '
f'output channels ({output_chan[ll]}).')
if flatten[ll] and hw_operator[ll] == op.NONE:
eprint(f'{layer_pfx(ll)}`flatten` is not compatible with passthrough '
'layers.')
if flatten[ll] and (pool[ll][0] > 1 or pool[ll][1] > 1):
eprint(f'{layer_pfx(ll)}`flatten` is not compatible with pooling.')
if flatten[ll] and streaming[ll]:
eprint(f'{layer_pfx(ll)}`flatten` is not compatible with streaming.')
if conv_groups[ll] > 1:
if not tc.dev.SUPPORT_DEPTHWISE:
eprint(f'{layer_pfx(ll)}Convolution groups ({conv_groups[ll]}) > 1 are not '
'supported on this device.')
if conv_groups[ll] != input_chan[ll] or conv_groups[ll] != output_chan[ll]:
eprint(f'{layer_pfx(ll)}Convolution groups ({conv_groups[ll]}) must be equal '
f'to the number of input channels ({input_chan[ll]}), and output '
f'channels ({output_chan[ll]}) must be equal to input channels.')
if flatten[ll]:
eprint(f'{layer_pfx(ll)}Convolution groups ({conv_groups[ll]}) > 1 are not '
'supported when flattening.')
if bias_group_map[ll] is not None:
eprint(f'{layer_pfx(ll)}`bias_group` is not supported for depth-wise layers.')
# if output_width[ll] != 8:
# eprint(f'{layer_pfx(ll)}convolution groups ({conv_groups[ll]}) > 1 are not'
# f' supported when using `wide` output.')
if input_skip[ll] != 0 and not tc.dev.SUPPORT_MULTIPASS_STRIDE:
eprint(f'{layer_pfx(ll)}`read_gap` must be 0 for this device.')
if input_skip[ll] != 0 and in_expand[ll] > 1 \
and not tc.dev.SUPPORT_MULTIPASS_READ_STRIDE:
eprint(f'{layer_pfx(ll)}`read_gap` must be 0 when using more than 64 channels or '
'multi-pass on this device.')
# Conv1d pool_dilation
if pool_dilation[ll][0] < 1 or pool_dilation[ll][1] < 1 \
or pool_dilation[ll][0] > tc.dev.MAX_POOL_DILATION \
or pool_dilation[ll][1] > tc.dev.MAX_POOL_DILATION:
eprint(f'{layer_pfx(ll)}`pool_dilation` values must be 1 or greater, and '
f'{tc.dev.MAX_POOL_DILATION} or smaller on this device.')
if in_sequences[ll] is not None:
or_map = 0
and_map = ~or_map
for i, lt in enumerate(in_sequences[ll]):
if lt == -1:
emap = processor_map[0]
elif lt == -2:
# pylint: disable=unsubscriptable-object
emap = data_buffer_cfg[0]['proc']
else:
emap = output_processor_map[lt]
or_map |= emap
and_map &= emap
if or_map != processor_map[ll]:
wprint(f'{layer_pfx(ll)}The `output_processor_map`s of the `in_sequences` '
'do not combine to `processor_map`.')
if and_map != processor_map[ll]:
# Concatenate channels sequentially
if and_map != 0:
wprint(f'{layer_pfx(ll)}Channel concatenation has `output_processor_map` '
f'overlap for `in_sequences` {in_sequences[ll]}.')
for i, lt in enumerate(in_sequences[ll]):
if lt != -1 and in_offset[ll] != out_offset[lt]:
wprint(f'{layer_pfx(ll)}`in_offset` (0x{in_offset[ll]:04x}, for '
f'input #{i}) does not match `out_offset` '
f'(0x{out_offset[lt]:04x}) for layer {layer_str(lt)}.')
else:
# Channel-wise concatenation or element-wise or interleaved concatenation
offs = 0
for i, lt in enumerate(in_sequences[ll]):
if lt not in [-1, -2] \
and in_offset[ll] + 4 * i != out_offset[lt] + 4 * input_crop[ll][0] \
and in_offset[ll] + offs != out_offset[lt]:
wprint(f'{layer_pfx(ll)}`in_offset` (0x{in_offset[ll]:04x}, for '
f'input #{i}) does not match `out_offset` '
f'(0x{out_offset[lt]:04x}) for layer {layer_str(lt)}.')
offs += 4 * output_dim[lt][0] * output_dim[lt][1]
if operands[ll] == 1: # cat
if write_gap[ll] == 0:
min_proc = -1
max_proc = -1
for lt in in_sequences[ll]:
first_proc = ffs(processor_map[0]) if lt == -1 \
else ffs(output_processor_map[lt])
last_proc = fls(processor_map[0]) if lt == -1 \
else fls(output_processor_map[lt])
if first_proc < min_proc:
wprint(f'{layer_pfx(ll)}In `in_sequences` {in_sequences[ll]}, '
'an earlier layer in the sequence uses a higher first '
f'processor ({min_proc}) than layer {layer_str(lt)} which '
f'uses processor {first_proc}.')
if last_proc < max_proc:
wprint(f'{layer_pfx(ll)}In `in_sequences` {in_sequences[ll]}, '
'an earlier layer in the sequence uses a higher last '
f'processor ({max_proc}) than layer {layer_str(lt)} which '
f'uses processor {last_proc}.')
min_proc = first_proc
max_proc = last_proc
else: # eltwise
eltwise_proc_map = 0
for lt in in_sequences[ll]:
emap = processor_map[0] if lt == -1 else output_processor_map[lt]
if eltwise_proc_map not in (0, emap):
eprint(f'{layer_pfx(ll)}In `in_sequences` {in_sequences[ll]}, '
'an earlier layer in the sequence uses a different output '
f'processor map (0x{eltwise_proc_map:016x}) than layer '
f'{layer_str(lt)} which uses 0x{emap:016x}.')
eltwise_proc_map = emap
# Merge the output of all processors of all input sequence members
emap = 0
for lt in in_sequences[ll]:
if lt == -1:
emap |= processor_map[0]
elif lt == -2:
# pylint: disable=unsubscriptable-object
emap |= data_buffer_cfg[0]['proc']
else:
emap |= output_processor_map[lt]
# Check that all input processors receive data from somewhere in the merged map
if processor_map[ll] & emap != processor_map[ll]:
wprint(f'{layer_pfx(ll)}The processor map {processor_map[ll]:016x} specifies '
'processors that have no data from any of the input sequences '
f'{in_sequences[ll]}.')
if output_width[ll] != 8 and hw_operator[ll] == op.NONE:
eprint(f'{layer_pfx(ll)}'
'The passthrough operator requires an output width of 8.')
# Deduplicate kernels
# Do this here since by now all modifications to the kernels have happened
kernel_ptrs: List[int] = [] # Indirection for hw_kernel
bias_ptrs: List[int] = []
if state.deduplicate_weights:
kernel_ptrs, hw_kernel = kdedup.deduplicate(
hw_kernel,
layers,
quantization,
processor_map,
kind='kernels',
)
bias_ptrs, bias = kdedup.deduplicate(
bias,
layers,
quantization,
processor_map,
kind='bias',
)
else:
kernel_ptrs = list(range(len(hw_kernel)))
bias_ptrs = list(range(len(bias)))
state.weights = hw_kernel
state.bias = bias
# Create comment of the form "k1_b0-1x32x32b_2x2s2p14-..."
test_name = prefix
if not embedded_code:
for ll in range(first_layer_used, layers):
test_name += f'-{input_chan[ll]}x{input_dim_str[ll]}' \
f'{"b" if big_data[ll] else "l"}' \
f'{"f" if flatten[ll] else ""}_' \
+ ("avg" if pool_average[ll]
and (pool[ll][0] > 1 or pool[ll][1] > 1) else "") \
+ ("max" if not pool_average[ll]
and (pool[ll][0] > 1 or pool[ll][1] > 1) else "") \
+ f'{pool_str[ll]}s{pool_stride[ll][0]}' \
f'p{padding[ll][0]}' \
f'm{output_chan[ll]}'
if activation[ll] == op.ACT_RELU:
test_name += "_relu"
elif activation[ll] == op.ACT_ABS:
test_name += "_abs"
if repeat_layers > 1:
test_name += f'_repeat{repeat_layers}'
MAX_PATH = 255
if len(test_name) + len(base_directory) > MAX_PATH - 10:
h = hashlib.md5(test_name.encode()).hexdigest() # Immutable hash from test name
cutoff = MAX_PATH - len(test_name) - len(base_directory) - len(h) - 10
test_name = test_name[:cutoff] + '-' + h
print(f'{test_name}...')
try:
target_dir = os.path.join(base_directory, test_name)
os.makedirs(target_dir, exist_ok=False)
except OSError:
if not overwrite:
eprint('The target folder', target_dir, 'exists. Use --overwrite to proceed.')
else:
nprint('--overwrite specified, writing to', target_dir, 'even though it exists.')
# Redirect stdout?
if log:
state.output_is_console = False
sys.stdout = open(
os.path.join(base_directory, test_name, log_filename),
mode='w',
encoding='utf-8',
)
print(f'{" ".join(str(x) for x in sys.argv)}')
print(f'{tc.dev.partnum}\n')
print(f'{test_name}')
if block_mode:
filename = state.input_filename + '.mem'
else:
filename = c_filename + ('_riscv' if riscv else '') + '.c'
if not block_mode and (embedded_code or compact_data):
sampledata_header = \
open(
os.path.join(base_directory, test_name, state.sample_filename),
mode='w',
encoding='utf-8',
)
sampledata_header.write('// This file was @generated automatically\n\n')
if state.generate_kat and state.result_filename is not None:
sampleoutput_header = \
open(
os.path.join(base_directory, test_name, state.result_filename),
mode='w',
encoding='utf-8',
)
sampleoutput_header.write('// This file was @generated automatically\n\n')
else:
sampleoutput_header = None
else:
sampledata_header = sampleoutput_header = None
if not block_mode and not state.rtl_preload_weights:
weight_header = \
open(
os.path.join(base_directory, test_name, weight_filename),
mode='w',
encoding='utf-8',
)
weight_header.write('// This file was @generated automatically\n\n')
else:
weight_header = None
# Calculate the groups needed, and groups and processors used overall
processors_used = 0
group_map = [None] * layers
broadcast_mode = [False] * layers
emulate_eltwise = [False] * layers
for ll in range(first_layer_used, layers):
bits = processor_map[ll]
processors_used |= bits
fl = ' (before flattening)' if flatten[ll] else ''
if input_chan[ll] > tc.dev.MAX_CHANNELS:
eprint(f'{layer_pfx(ll)}Configured for {input_chan[ll]} input channels{fl}, which '
f'exceeds the system maximum of {tc.dev.MAX_CHANNELS}.')
if output_chan[ll] > tc.dev.MAX_CHANNELS:
eprint(f'{layer_pfx(ll)}Configured for {output_chan[ll]} output channels, which '
f'exceeds the system maximum of {tc.dev.MAX_CHANNELS}.')
if (ll != start_layer or not fast_fifo_quad) \
and popcount(processor_map[ll]) != in_expand_thresh[ll]:
eprint(f'{layer_pfx(ll)}{input_chan[ll]} input '
f'{plural(input_chan[ll], "channel")}{fl} '
f'using {in_expand[ll]} {plural(in_expand[ll], "pass", "es")}, '
f'and {operands[ll]} {plural(operands[ll], "operand")} '
f'({in_expand_thresh[ll]} processors '
f'per pass), but the enabled processor map 0x{processor_map[ll]:016x} '
f'has {popcount(processor_map[ll])} bits instead of the '
f'expected number of {in_expand_thresh[ll]}.')
if ll == start_layer and fast_fifo_quad \
and popcount(processor_map_0) != in_expand_thresh[ll]:
eprint(f'{layer_pfx(ll)}{input_chan[ll]} input '
f'{plural(input_chan[ll], "channel")}{fl} '
f'using {in_expand[ll]} {plural(in_expand[ll], "pass", "es")} '
f'({in_expand_thresh[ll]} processors per pass), but the '
f'enabled processor map 0x{processor_map[ll]:016x} '
f'has {popcount(processor_map[ll])} bits instead of the '
f'expected number of {in_expand_thresh[ll]}.')
if popcount(output_processor_map[ll]) != out_expand_thresh[ll]:
eprint(f'{layer_pfx(ll)}{output_chan[ll]} output '
f'{plural(output_chan[ll], "channel")} using {out_expand[ll]} '
f'{plural(out_expand[ll], "pass", "es")} '
f'({out_expand_thresh[ll]} processors per pass), but the '
f'processor output map 0x{output_processor_map[ll]:016x} '
f'has {popcount(output_processor_map[ll])} bits instead of the '
f'expected number of {out_expand_thresh[ll]}.')
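# Record which x16 processor groups contain at least one enabled processor for this layer.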
this_map = []
for group in range(tc.dev.P_NUMGROUPS):
if (processor_map[ll] >> group*tc.dev.P_NUMPRO) % 2**tc.dev.P_NUMPRO:
this_map.append(group)
group_map[ll] = this_map
# Ensure input and output map are the same for passthrough layers
if hw_operator[ll] == op.NONE:
for group in range(tc.dev.P_NUMGROUPS):
in_pro = 2**popcount(
(processor_map[ll] >> group*tc.dev.P_NUMPRO) % 2**tc.dev.P_NUMPRO
) - 1
out_pro = (output_processor_map[ll] >> group*tc.dev.P_NUMPRO) \
% 2**tc.dev.P_NUMPRO
if out_pro != 0:
out_pro >>= ffs(out_pro)
if out_pro != in_pro:
eprint(f'{layer_pfx(ll)}The output processors for a pass-through layer '
'must be a packed version of the input processors for each x16. '
f'Configured are: input {processor_map[ll]:016x}, output '
f'{output_processor_map[ll]:016x}.')
# Ensure byte positions are the same in the input and output map for
# depthwise convolutions
if conv_groups[ll] > 1:
if ffs(output_processor_map[ll]) % tc.dev.P_SHARED != 0:
eprint(f'{layer_pfx(ll)}Output processors for depth-wise convolutions '
'must be aligned to a multiple of 4. Configured for this layer: '
f'{output_processor_map[ll]:016x}.')
if ffs(processor_map[ll]) % tc.dev.P_SHARED != 0 \
and (processor_map[ll] >> ffs(processor_map[ll])) // 2**tc.dev.P_NUMPRO > 0:
eprint(f'{layer_pfx(ll)}When spanning groups for depth-wise convolutions, '
'processors must be aligned to a multiple of 4. Configured for this '
f'layer: {processor_map[ll]:016x}.')
if processor_map[ll] == output_processor_map[ll]:
broadcast_mode[ll] = True
elif state.energy_warning:
nprint(f'{layer_pfx(ll)}depth-wise convolution moves data across processors. '
f'This has a performance impact. Input 0x{processor_map[ll]:016x}, '
f'output 0x{output_processor_map[ll]:016x}.')
# Block certain element-wise operations when not using passthrough mode
if tc.dev.EMULATE_ELTWISE_MP and operands[ll] > 1 and in_expand[ll] > 1 \
and operands[ll] * in_expand[ll] != operands[ll] + in_expand[ll]:
if hw_operator[ll] != op.NONE or pool[ll][0] > 1 or pool[ll][1] > 1 \
or pool_stride[ll][0] > 1 or pool_stride[ll][1] > 1:
eprint(f'{layer_pfx(ll)}The element-wise operation exceeds a multi-pass of 2 '
'and therefore does not support pooling or convolution.')
emulate_eltwise[ll] = True
# Warn if hidden layers use channel count that is not divisible by 4
if ll != start_layer and input_chan[ll] % 4 != 0 and state.energy_warning:
nprint(f'{layer_pfx(ll)}The input channel count ({input_chan[ll]}) is not '
'a multiple of 4. Best energy performance is achieved with multiples of 4.')
groups_used = []
for group in range(tc.dev.P_NUMGROUPS):
if ((processors_used |
output_processor_map[final_layer]) >> group*tc.dev.P_NUMPRO) % 2**tc.dev.P_NUMPRO: