-
Notifications
You must be signed in to change notification settings - Fork 1
/
radio_AGC_LMS_IIR_SAA.asm
1744 lines (1507 loc) · 69.4 KB
/
radio_AGC_LMS_IIR_SAA.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "spi.h"
#include "nordic_regs.h"
//----------- General / SPORT command constants ------------
#define LED_BLINK 0x0010 // blink LED
#define NEXT_CHANNEL 0x3f00 // Intan command CONVERT(63) -- convert next channel
#define SHIFT_BITS 1 // SPORT may configure for more than 16-bit read/write...compensate
#define NEXT_CHANNEL_SHIFTED 0x7e00 // =NEXT_CHANNEL << SHIFT_BITS
#define SAMPLE_SHIFT 5 // =SHIFT_BITS+4, to get sample down to 12-bits.
#define COMMAND_SHIFT 9 // channel << 8=commands, command<<1=17 bit SPORT write
//----------- Intan read commands for registers 40-44 ------------
// Each comment gives the 16-bit word sent and the expected reply.
#define READ_40 0xe800 // sending 11_101000_00000000, get 00000000_01001001 (0x0049)
#define READ_41 0xe900 // sending 11_101001_00000000, get 00000000_01001110 (0x004e)
#define READ_42 0xea00 // sending 11_101010_00000000, get 00000000_01010100 (0x0054)
#define READ_43 0xeb00 // sending 11_101011_00000000, get 00000000_01000001 (0x0041)
#define READ_44 0xec00 // sending 11_101100_00000000, get 00000000_01001110 (0x004e)
// Intan register setup values
// Each comment gives the word expected back after the write.
#define REG0 0x80de // get back 0xffde
#define REG1 0x8102 // get back 0xff02
#define REG2 0x8204 // get back 0xff04
#define REG3 0x8300 // get back 0xff00
// REG4 variants: pick one depending on DSP high-pass enable and output number format.
// NOTE(review): REG4_DSP_SIGNED and REG4_DSP_SIGNED_300Hz are the same value (0x84D4).
#define REG4 0x8480 // get back 0xff80 - no DSP, offset binary
#define REG4_DSP_UNSIGNED 0x8494 // get back 0xff94 - DSP, offset binary
#define REG4_DSP_SIGNED 0x84D4 // get back 0xffD4 - DSP, signed output
#define REG4_SIGNED 0x84C4 // get back 0xffC4 - no DSP, signed output
#define REG4_DSP_SIGNED_1Hz 0x84DC // get back 0xffDC - DSP 1Hz, signed output
#define REG4_DSP_SIGNED_300Hz 0x84D4 // get back 0xffD4 - DSP 300Hz, signed output
#define REG5 0x8500 // get back 0xff00
#define REG6 0x8600 // get back 0xff00
#define REG7 0x8700 // get back 0xff00
//----------- Hardware bandpass settings ------------
// Exactly one of the three REG8-13 groups below should be active; the others stay commented out.
// Intan REG8-13, configured for [250Hz, 10kHz]
#define REG8 0x8811 // get back 0xff11.
#define REG9 0x8900 // get back 0xff00
#define REG10 0x8a10 // get back 0xff10.
#define REG11 0x8b00 // get back 0xff00
#define REG12 0x8c11 // get back 0xff11.
#define REG13 0x8d00 // get back 0xff00
// Intan REG8-13, configured for [250Hz, 7.5kHz]
//#define REG8 0x8816 // get back 0xff16
//#define REG9 0x8900 // get back 0xff00
//#define REG10 0x8a17 // get back 0xff17
//#define REG11 0x8b00 // get back 0xff00
//#define REG12 0x8c15 // get back 0xff15
//#define REG13 0x8d00 // get back 0xff00
// Intan REG8-13, configured for [1Hz, 10kHz]
//#define REG8 0x8811 // get back 0xff11
//#define REG9 0x8900 // get back 0xff00
//#define REG10 0x8a10 // get back 0xff10
//#define REG11 0x8b00 // get back 0xff00
//#define REG12 0x8c2c // get back 0xff2c
//#define REG13 0x8d06 // get back 0xff06
//---------------------------------------------------
#define REG14 0x8eff // get back 0xffff
#define REG15 0x8fff // get back 0xffff
#define REG16 0x90ff // get back 0xffff
#define REG17 0x91ff // get back 0xffff
#define CALIB 0x5500 // get back 0x8000
#define CONVERT0 0x0000
.align 8 // call is a 5-cycle latency if target is aligned.
_get_asm:
/* Sampling thread.
Waits for a new set of SPORT samples, then processes one channel from each of the
four amplifiers in two passes (SPORT1 pair first, SPORT0 pair second). Each pass does:
offset removal/high-pass (integrator), AGC gain, LMS, and two IIR biquads.
The four filtered samples are then byte-packed and compared against templates A and B
using SAA; the match bits are OR-ed into the per-group match byte and its 7-bit encoding.
When the current channel group is 31, the four bytes addressed through FP_TXCHAN0..3 are
appended to the radio packet at p4, and once 6 such sample sets are queued the template
match words are appended as well.

Pointer/register use visible in this routine:
  FP-relative slots hold the persistent state (FP_CHAN, FP_QS, FP_QPACKETS, ...).
  i0 walks the coefficient / LMS weight / template stream.
  i1 (read) and i2 (write) walk the per-channel state buffer.
  i3 walks the saved byte-packed sample history used for template matching.
  p4 is the write pointer into the packet buffer (WFBUF).
  On exit p1 and p3 are reloaded from FP_FIO_FLAG_D and FP_SPI_TDBR.
Clobbers almost all registers (r0-r7, p0, p5, a0, a1 are all overwritten). But it's ok:
callers simply must not keep live values in them across a call.
*/
// Reset p0 to point to SPORT0_RX.
p0 = [FP - FP_SPORT0_RX];
/* When this thread first starts running, [FP_CHAN] = 30. This reg holds the
current channel and is incremented before actually reading the samples.
Therefore, after incrementing, this value will be the channel number corresponding
to that being read out on MOSI.
The SPORT pipeline is setup such that, when _get_asm is first executed, we have:
FIFO: CONVERT(3)
MOSI: CONVERT(2)
ADC: CONVERT(1)
MISO: CONVERT(0) result.
Thus the initial [FP_CHAN]=30 value is incremented to 31, corresponding with MISO result.
After reading MISO, CONVERT(3) will need to be loaded, thus the channel to be loaded is
FP_CHAN+4 (after FP_CHAN is already incremented).
*/
// Busy-wait until bit 0 of SPORT1_STAT is set.
// NOTE(review): bit 0 is presumably the "RX FIFO not empty" flag -- confirm against the SPORT docs.
wait_samples_main:
r3 = w[p0 + (SPORT1_STAT - SPORT0_RX)];
cc = bittst(r3, 0);
if !cc jump wait_samples_main;
// we got samples back from MISO, new CONVERT command transmitted!
// now increment the channel, and queue up new command
r6 = [FP - FP_CHAN];
r6 += 1;
bitclr(r6, 5); // roll back to 0 if it's 32
[FP - FP_CHAN] = r6;
/* Read in the samples.
Note the difference in channel organization between this version and Tim's headstages.
Tim's board, amplifiers are connected, from left-to-right, looking from top of board as:
SPORT0_PRI = Ch0-31
SPORT0_SEC = Ch32-63
SPORT1_PRI = Ch64-95
SPORT1_SEC = Ch96-127
This board:
SPORT1_SEC = Ch0-31
SPORT1_PRI = Ch32-63
SPORT0_SEC = Ch64-95
SPORT0_PRI = Ch96-127
To minimize firmware changes, the order of reading SPORT ports have been changed -- SPORT0 switch with
SPORT1. r0 switched with r1. This way the reg names in the rest of signal chain stays the same.
The electrode to channel configurations between the two boards are not the same. So sorted templates
are not interchangeable.
*/
//---------------------------------------------------------------------------------------
// Pass 1: the two channels read from SPORT1.
// r2 = 0xFFFF0000, used as a mask to keep only the high half of r1.
r2.h = 0xFFFF;
r2.l = 0x0000;
// Two successive reads of the same RX register return the primary then the secondary word.
r1 = [p0 + (SPORT1_RX - SPORT0_RX)]; // SPORT1-primary: Ch32-63
r0 = [p0 + (SPORT1_RX - SPORT0_RX)]; // SPORT1-sec: Ch0-31
r1 <<= 15; // Shift 15 bits because I read in 17 bits in lower 2 bytes
r0 >>= SHIFT_BITS;
r1 = r1 & r2;
// r2.h = primary sample (Ch32-63), r2.l = secondary sample (Ch0-31); 16-bit samples.
// (assumes the upper half of r0 is clear after the shift -- it is added, not masked)
r2 = r0 + r1; // r2 = Ch32, Ch0 (lo, hi). 16-bits samples
r5 = [i0++]; // r5.l=0x7fff, r5.h=0x8000. i0 @2nd inte coefs after.
.align 8
// integrator stage - the incoming samples are already in Q1.15 format.
[i2++] = r2 || r0 = [i1++]; // save new sample. i1,i2 @ integrator mean after
a0 = r2.l * r5.l, a1 = r2.h * r5.l || r1 = [i1++]; // a0=(Q31)sample. r1=integrated mean. i1@AGCgain
// r0=how much sample deviates from integrated mean. r6.l=0x7fff, r6.h=0x0640
r0.l = (a0 += r1.l * r5.h), r0.h = (a1 += r1.h * r5.h) || r6=[i0++]; // i0@AGC_target after
a0 = r1.l * r6.l, a1 = r1.h * r6.l; // a0=mean in Q31
// r2=updated mean. r5=AGC-gain, r7=AGC-target. i1@saturated AGCout, i0@AGC-scaler/enable.
r2.l = (a0 += r0.l * r6.h), r2.h = (a1 += r0.h * r6.h) || r5=[i1++] || r7=[i0++];
// AGC-stage
a0 = r0.l * r5.l, a1 = r0.h * r5.h || [i2++] = r2; //a0,a1=AGCgain*devFromMean. Save new mean. i2@AGCgain
/* Result in accumulator is Q1.31. Multiplication treated as (Q1.31)*(Q7.8). Q7.8 has
8 bits before the decimal point. The result should have 9 bits before the decimal point.
Therefore, we need to left shift the accumulator 8 more bits to position the actual decimal
point at accumulator's decimal point position.
This is because when we move the accumulator result to half dreg, we only take the highest
16-bits (A0.H, A1.H) in default mode (treat as signed fraction).
*/
a0 = a0 << 8;
a1 = a1 << 8;
r0.l = a0, r0.h = a1; // Move half reg, treat result as signed frac. Round to bit 16 and extract.
a0 = abs a0, a1 = abs a1; // Need to compare abs-scaled deviation to target.
// Signed-int mode for MAC. Result should be 0x7fff or 0x8000, tell us if target is smaller or not.
// r6.l = 16384 (0.5), r6.h = 1 (AGC-enable). i0@LMS w6 (for channel (c-7)%32)
r4.l = (a0 -= r7.l * r7.l), r4.h = (a1 -= r7.h * r7.h) (is) || r6 = [i0++];
/* Update AGC-gain (trick math here):
r5 = AGC-gain, which is in Q8.8 format. r6.l=0x4000=0.5 in Q1.15 format.
Turns out if we do
a0=r5.l*r6.l, r3.l=(a0-=0*0) (s2rnd)
Then the result in r3.l is the same value in Q8.8 format.
In the second line below, r4.l/h is either 0x7fff or 0x8000. So it acts as
either subtracting 2^-8 from the original Q8.8 gain's value; Or as adding
2^-7 to the original Q8.8 gain's value after extraction.
*/
a0 = r5.l * r6.l, a1 = r5.h * r6.l; // load a0 with scaled AGC-gain
r3.l = (a0 -= r4.l * r6.h), r3.h = (a1 -= r4.h * r6.h) (s2rnd); // within certain range gain doesnt change
.align 8
r3 = abs r3 (v) || r7 = [FP-FP_WEIGHTDECAY]; // r3=updated AGC gain; r7=1 if LMS
/* Start LMS, at this point:
i0 @ LMS w6, for channel (c-7)%32. Current channel is c (bank A1)
i1 @ saturated AGCout (bank W1)
i2 @ AGCgain (bank W1)
r0=AGC-gained sample
r1=integrated mean
r2=new integrator mean
r3=new AGCgain
r4=AGC-target comparison result
r5=old AGCgain
r6=AGC enable and coefs
r7=LMS enable/Weightdecay
m0 = -28 (moving back 7 LMS weights in A1)
m1 = -7*W1_STRIDE*2*4 (moving back 7 channels on same amplifier, in W1)
m2 = W1_STRIDE*2*4 (moving forward 1 channel on same amplifier, in W1)
m3 = 8 (jump forward two 32-bit words).
*/
// ------ Steps to skip LMS gracefully -----
// (disabled alternative, kept for reference: it only advances the pointers past the LMS data)
// saturate the sample (r4), save the AGCgain (r3), i2@saturated AGCout
// move i1 to x0(n-1),
/* r4.l = (a0 = r0.l * r7.l), r4.h = (a1 = r0.h * r7.l) (is) || i1 += m3 || [i2++] = r3;
mnop || r2 = [i0++m3] || [i2++] = r4; // move i0 to w4, save sat samp (r4), i2@AGC/LMS-out
mnop || r2 = [i0++m3] || [i2++] = r0; // move i0 to w2, save AGC samp (r0), i2@x0(n-1)
r2 = [i0++m3]; // move i0 to w0
r2 = [i0++]; // move i0 to LPF1 b0
nop; nop; nop; nop; nop; nop; nop; nop; nop; nop;
nop; nop; nop; nop; nop; nop; nop; nop; nop; nop;
nop; nop;
*/
// -------------------------------------------
// saturate the sample (r4), save the AGCgain (r3),
// move i1 to saturated AGCout 7 channels before, i2@saturated AGCout
r4.l = (a0 = r0.l * r7.l), r4.h = (a1 = r0.h * r7.l) (is) || i1 += m1 || [i2++] = r3;
mnop || r1 = [i1++] || [i2++] = r4; // i1@AGCout of c-7, save sat sample (r4), i2@AGCout of c
mnop || r1 = [i1++m2] || r2 = [i0++]; // r1=AGCout(c-7), i1@AGCout(c-6), r2=LMS w6, i0@LMS w5
// LMS summation
// Seven-tap dot product: sum over k=7..1 of AGCout(c-k) * w(k-1), low and high halves in parallel.
a0 = r1.l * r2.l, a1 = r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
// r1=AGCout(c-1), i1@AGCout(c), r2=LMS w0, i0@LPF b0
r6.l = (a0+= r1.l * r2.l), r6.h = (a1+= r1.h * r2.h) || r1 = [i1++m1] || r2 = [i0++m0];
// r6=summation result, move i1 to AGCout(c-7), move i0 to LMS w6
// NOTE(review): this nop has no counterpart at the same point of the second pass -- confirm intentional.
nop;
r0 = r0 -|- r6 (s) || [i2++] = r0 || r1 = [i1--];
// compute error (r0), save AGCout (r0), i2@x0(n-1), move i1 to sat-AGCout(c-7).
r6 = r0 >>> 15 (v,s) || r1 = [i1++m2] || r2 = [i0++];
// r6=sign(error), r1=sat-AGCout(c-7), i1@sat-AGCout(c-6), r2=LMS w6, i0@LMS w5
.align 8
// Weight update, one weight per pair of instructions: w_new = w*r7.h + satAGCout*sign(error).
a0 = r2.l * r7.h, a1 = r2.h * r7.h || nop || nop; // weight*LMS_enable
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
//r5=new w6, r1=satAGCout(c-6), i1@satAGCout(c-5), r2=w5, i0@w6
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
// save w6 (r5), i0@w4
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
// r1=satAGCout(c-5), i1@satAGCout(c-4), r2=w4, i0@w5
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
// save w5, i0@w3
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
// r1=satAGCout(c-4), i1@satAGCout(c-3), r2=w3, i0@w4
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
// save w4, i0@w2
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
// r1=satAGCout(c-3), i1@satAGCout(c-2), r2=w2, i0@w3
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
// save w3, i0@w1
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
// r1=satAGCout(c-2), i1@satAGCout(c-1), r2=w1, i0@w2
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
// save w2, i0@w0
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
// r1=satAGCout(c-1), i1@satAGCout(c), r2=w0, i0@w1
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
// save w1, i0@LPF1 b0
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m3] || r2 = [i0--];
// r1=satAGCout(c), i1@x0(n-1), r2=LPF1_b0, i0@w0
mnop || [i0++] = r5; // save w0, i0@LPF1 b0
.align 8
/* Start 2 back-to-back direct-form 1 biquads. At this point,
r0 = LMS sample
i0 @ LPF1 b0
i1 @ x0(n-1) of current channel.
i2 @ x0(n-1) of current channel.
If i1 and i2 are dereferenced (read out) in the same cycle, the processor will stall -- each of the
1k SRAM memory banks has only one port.
*/
r5 = [i0++] || r1 = [i1++]; // r5=b0(LPF), r1=x0(n-1); i0@b1(LPF), i1@x0(n-2)
a0 = r0.l * r5.l, a1 = r0.h * r5.h || r6 = [i0++] || r2 = [i1++]; // r6=b1(LPF), r2=x0(n-2)
a0 += r1.l * r6.l, a1 += r1.h * r6.h || r7 = [i0++] || r3 = [i1++]; // r7=a0(LPF), r3=y1(n-1)
a0 += r2.l * r5.l, a1 += r2.h * r5.h || r5 = [i0++] || r4 = [i1++]; // r5=a1(LPF), r4=y1(n-2)
a0 += r3.l * r7.l, a1 += r3.h * r7.h || [i2++] = r0; // update x0(n-1), i2@x0(n-2)
r0.l = (a0 += r4.l * r5.l), r0.h = (a1 += r4.h * r5.h) (s2rnd) || [i2++] = r1; // r0=y1(n), update x0(n-2)
// now i0@b0(HPF1), i1@y2(n-1). i2@y1(n-1)
r5 = [i0++] || [i2++] = r0; // r5=b0(HPF1), update y1(n-1); i0@b1(HPF1), i2@y1(n-2)
a0 = r0.l * r5.l, a1 = r0.h * r5.h || r6 = [i0++] || [i2++] = r3; // r6=b1(HPF1). update y1(n-2)
a0 += r3.l * r6.l, a1 += r3.h * r6.h || r7 = [i0++] || r1 = [i1++]; // r7=a0(HPF1). r1=y2(n-1)
a0 += r4.l * r5.l, a1 += r4.h * r5.h || r2 = [i1++]; // r2=y2(n-2)
a0 += r1.l * r7.l, a1 += r1.h * r7.h || r5 = [i0++]; // r5=a1(HPF1)
r0.l = (a0 += r2.l * r5.l), r0.h = (a1 += r2.h * r5.h) (s2rnd); // r0=y2(n)
// now i0@next ch integ1; i1@next sample; i2@y2(n-1)
[i2++] = r0; // save current y2(n) in y2(n-1)'s spot. i2 @ y2(n-2)
[i2++] = r1; // save current y2(n-1) in y2(n-2)'s spot. i2 @ next sample
[--sp] = r0; // save current result on the stack
// all pointers are ready for next two channels of this group.
// i0 @ next ch integ1; i1,i2 @ next sample.
//---------------------------------------------------------------------------------------
// Pass 2: the two channels read from SPORT0 (p0 itself points at SPORT0_RX).
r2.h = 0XFFFF;
r2.l = 0x0000;
// Process the other two channels in this group. Pretty much identical as before.
r1 = [p0]; // SPORT0-primary: Ch96-127
r0 = [p0]; // SPORT0-sec: Ch64-95
r1 <<= 15;
r0 >>= SHIFT_BITS;
r1 = r1 & r2;
r2 = r0 + r1;
r5 = [i0++];
.align 8
// integrator stage
[i2++] = r2 || r0 = [i1++]; // save new sample. i1,i2 @ integrator mean after.
a0 = r2.l * r5.l, a1 = r2.h * r5.l || r1 = [i1++]; // a0=(Q15)sample. r1=integrated mean. i1@AGCgain
// r0=how much sample deviates from mean. r6.l=0x7fff, r6.h=0x0640
r0.l = (a0 += r1.l * r5.h), r0.h = (a1 += r1.h * r5.h) || r6=[i0++]; // i0@AGC target after
a0 = r1.l * r6.l, a1 = r1.h * r6.l; // a0=mean
// r2=updated mean. r5=AGCgain, r7=AGCTarget. i1@final-sample, i0@AGC-scaler/enable.
r2.l = (a0 += r0.l * r6.h), r2.h = (a1 += r0.h * r6.h) || r5=[i1++] || r7=[i0++];
// AGC-stage
a0 = r0.l * r5.l, a1 = r0.h * r5.h || [i2++] = r2; //a0,a1=AGCgain*devFromMean. Save new mean. i2@AGCgain
a0 = a0 << 8;
a1 = a1 << 8;
r0.l = a0, r0.h = a1; // Move half reg, treat result as signed frac. Round to bit 16 and extract.
a0 = abs a0, a1 = abs a1; // Compare abs-scaled deviation to target.
// signed-int mode MAC. Result should be 0x7fff or 0x8000 - tells if target is smaller or not.
// r6.l = 0x4000, r6.h=1 (AGC-enable). i0@integ_coef1
r4.l = (a0 -= r7.l * r7.l), r4.h = (a1 -= r7.h * r7.h) (is) || r6 = [i0++];
// update AGCgain
a0 = r5.l * r6.l, a1 = r5.h * r6.l; // convert AGCgain to Q1.15
r3.l = (a0 -= r4.l * r6.h), r3.h = (a1 -= r4.h * r6.h) (s2rnd);
.align 8
r3 = abs r3 (v) || r7 = [FP-FP_WEIGHTDECAY];
// ------ Steps to skip LMS gracefully -----
// (disabled alternative, kept for reference: it only advances the pointers past the LMS data)
// saturate the sample (r4), save the AGCgain (r3), i2@saturated AGCout
// move i1 to x0(n-1),
/* r4.l = (a0 = r0.l * r7.l), r4.h = (a1 = r0.h * r7.l) (is) || i1 += m3 || [i2++] = r3;
mnop || r2 = [i0++m3] || [i2++] = r4; // move i0 to w4, save sat samp (r4), i2@AGC/LMS-out
mnop || r2 = [i0++m3] || [i2++] = r0; // move i0 to w2, save AGC samp (r0), i2@x0(n-1)
r2 = [i0++m3]; // move i0 to w0
r2 = [i0++]; // move i0 to LPF1 b0
nop; nop; nop; nop; nop; nop; nop; nop; nop; nop;
nop; nop; nop; nop; nop; nop; nop; nop; nop; nop;
nop; nop;
*/
// -------------------------------------------
// LMS
// saturate the sample (r4), save the AGCgain (r3),
// move i1 to saturated AGCout 7 channels before, i2@saturated AGCout
r4.l = (a0 = r0.l * r7.l), r4.h = (a1 = r0.h * r7.l) (is) || i1 += m1 || [i2++] = r3;
mnop || r1 = [i1++] || [i2++] = r4; // i1@AGCout of c-7, save sat sample (r4), i2@AGCout of c
mnop || r1 = [i1++m2] || r2 = [i0++]; // r1=AGCout(c-7), i1@AGCout(c-6), r2=LMS w6, i0@LMS w5
// LMS summation
a0 = r1.l * r2.l, a1 = r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
a0+= r1.l * r2.l, a1+= r1.h * r2.h || r1 = [i1++m2] || r2 = [i0++];
// r1=AGCout(c-1), i1@AGCout(c), r2=LMS w0, i0@LPF b0
r6.l = (a0+= r1.l * r2.l), r6.h = (a1+= r1.h * r2.h) || r1 = [i1++m1] || r2 = [i0++m0];
// r6=summation result. Same instruction sequence as the first pass from here on, minus the nop:
// compute error (r0), save AGCout (r0), then r6=sign(error) and start the weight updates.
r0 = r0 -|- r6 (s) || [i2++] = r0 || r1 = [i1--];
r6 = r0 >>> 15 (v,s) || r1 = [i1++m2] || r2 = [i0++];
.align 8
a0 = r2.l * r7.h, a1 = r2.h * r7.h || nop || nop; // weight*LMS_enable
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m2] || r2 = [i0--];
a0 = r2.l * r7.h, a1 = r2.h * r7.h || [i0++m3] = r5;
r5.l = (a0 += r1.l * r6.l), r5.h = (a1 += r1.h * r6.h) || r1 = [i1++m3] || r2 = [i0--];
mnop || [i0++] = r5; // save w0, i0@LPF1 b0
.align 8
/* Start 2 back-to-back direct-form 1 biquads. At this point,
r0 = final AGC sample
i0 @ LPF1 b0
i1 @ x0(n-1) of current channel.
i2 @ x0(n-1) of current channel.
If i1 and i2 are dereferenced (read out) in the same cycle, the processor will stall -- each of the
1k SRAM memory banks has only one port.
*/
r5 = [i0++] || r1 = [i1++]; // r5=b0(LPF), r1=x0(n-1); i0@b1(LPF), i1@x0(n-2)
a0 = r0.l * r5.l, a1 = r0.h * r5.h || r6 = [i0++] || r2 = [i1++]; // r6=b1(LPF), r2=x0(n-2)
a0 += r1.l * r6.l, a1 += r1.h * r6.h || r7 = [i0++] || r3 = [i1++]; // r7=a0(LPF), r3=y1(n-1)
a0 += r2.l * r5.l, a1 += r2.h * r5.h || r5 = [i0++] || r4 = [i1++]; // r5=a1(LPF), r4=y1(n-2)
a0 += r3.l * r7.l, a1 += r3.h * r7.h || [i2++] = r0; // update x0(n-1), i2@x0(n-2)
r0.l = (a0 += r4.l * r5.l), r0.h = (a1 += r4.h * r5.h) (s2rnd) || [i2++] = r1; // r0=y1(n), update x0(n-2)
// now i0@b0(HPF1), i1@y2(n-1). i2@y1(n-1)
r5 = [i0++] || [i2++] = r0; // r5=b0(HPF1), update y1(n-1); i0@b1(HPF1), i2@y1(n-2)
a0 = r0.l * r5.l, a1 = r0.h * r5.h || r6 = [i0++] || [i2++] = r3; // r6=b1(HPF1). update y1(n-2)
a0 += r3.l * r6.l, a1 += r3.h * r6.h || r7 = [i0++] || r1 = [i1++]; // r7=a0(HPF1). r1=y2(n-1)
a0 += r4.l * r5.l, a1 += r4.h * r5.h || r2 = [i1++]; // r2=y2(n-2)
a0 += r1.l * r7.l, a1 += r1.h * r7.h || r5 = [i0++]; // r5=a1(HPF1)
r3.l = (a0 += r2.l * r5.l), r3.h = (a1 += r2.h * r5.h) (s2rnd); // r3=y2(n)
// now i0@template_A(t); i1@next sample; i2@y2(n-1)
[i2++] = r3; // save current y2(n) in y2(n-1)'s spot. i2 @ y2(n-2)
r2 = [sp++] || [i2++] = r1; // save current y2(n-1) in y2(n-2)'s spot. i2 @ next sample.
// read back ch[0,32]'s results from stack to r2
//---------------------------------------------------------------------------------------
// Template comparison, plexon style. Sliding window, no threshold.
// r2=samples from amp1 and amp2; r3=samples from amp3 and amp4. Pack them
r2 = r2 >>> 8 (v); // vector shift 16-bit to 8 bits, preserve sign
r3 = r3 >>> 8 (v);
r4 = bytepack(r2, r3); // amp4,3,2,1 (hi to low byte)
r0 = [FP - FP_8080]; // r0=0x80808080;
r4 = r4 ^ r0; // convert to unsigned offset binary. SAA works on unsigned numbers.
r0 = r4; // save a copy for later
.align 8; // template A: load order is t, t-15, t-14,...t-1
a0 = a1 = 0 || r2 = [i0++]; //r2=template_A(t), r0 and r4 contains byte-packed samples just collected
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-15), r0=bytepack(t-15)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-14), r0=bytepack(t-14)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-13), r0=bytepack(t-13)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-12), r0=bytepack(t-12)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-11), r0=bytepack(t-11)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-10), r0=bytepack(t-10)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-9), r0=bytepack(t-9)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-8), r0=bytepack(t-8)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-7), r0=bytepack(t-7)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-6), r0=bytepack(t-6)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-5), r0=bytepack(t-5)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-4), r0=bytepack(t-4)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-3), r0=bytepack(t-3)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-2), r0=bytepack(t-2)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; //r2=template_A(t-1), r0=bytepack(t-1)
saa( r1:0, r3:2 ) || [i3++] = r4; // write bytepack(t), inc i3
// saa results in a0.l, a0.h, a1.l, a1.h (amp4,3,2,1); compare to aperture
// i0 @ Aperture[amp1A, amp2A]
r0 = a0, r1 = a1 (fu) || r2 = [i0++] || i3 += m0; // r2=aperture[amp1A,amp2A], i3@saved bytepack(t-6)
// subtract and saturate - if the answer is negative-->spike!
r0 = r0 -|- r2 (s) || r3 = [i0++] || i3 -= m3; // r0=[amp1A match, amp2A match], r3=aperture[amp3A,amp4A]
// i3@saved bytepack(t-8)
r1 = r1 -|- r3 (s) || i3 += m0; // r1=[amp3A match, amp4A match]; i3@bytepack(t-15) again.
// shift to bit 0, sign preserved
r0 = r0 >>> 15 (v); // r0.h and r0.l will be 0xffff or 0x0000 (match or not)
r1 = r1 >>> 15 (v);
r0 = -r0 (v); // now 0x0001 or 0x0000 (match or not)
r1 = -r1 (v); // save both for packing later
// NOTE(review): written as "<< =" with a space here, while the template-B path uses "<<=".
// Confirm the assembler accepts this spelling.
r1 << = 1;
r6 = r0 + r1; // r6=[14 zeros][amp4A][amp2A][14 zeros][amp3A][amp1A]
.align 8; // template B: load order is t-15, t-14,...,t
a0 = a1 = 0 || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-15), r0=bytepack(t-15)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-14), r0=bytepack(t-14)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-13), r0=bytepack(t-13)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-12), r0=bytepack(t-12)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-11), r0=bytepack(t-11)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-10), r0=bytepack(t-10)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-9), r0=bytepack(t-9)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-8), r0=bytepack(t-8)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-7), r0=bytepack(t-7)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-6), r0=bytepack(t-6)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-5), r0=bytepack(t-5)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-4), r0=bytepack(t-4)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-3), r0=bytepack(t-3)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-2), r0=bytepack(t-2)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t-1), r0=bytepack(t-1)
saa( r1:0, r3:2 ) || r0 = [i3++] || r2 = [i0++]; // r2=template_B(t), r0=bytepack(t)
saa( r1:0, r3:2 ); // i0@Aperture[amp1B,amp2B], i3@bytepack(t-15) for next set of channels
// saa result in a0.l, a0.h, a1.l, a1.h.
r0 = a0, r1 = a1 (fu) || r2 = [i0++]; // r2=aperture[amp1B,amp2B]
// subtract and saturate - if answer is negative-->spike!
r0 = r0 -|- r2 (s) || r3 = [i0++]; // r3=aperture[amp3B, amp4B], i0@integrator coeffs for next set
r1 = r1 -|- r3 (s);
// shift to bit 0, sign preserved
r0 = r0 >>> 15 (v);
r1 = r1 >>> 15 (v);
r0 = -r0 (v);
r1 = -r1 (v);
r1 <<= 3;
r0 <<= 2;
r0 = r0 + r1; // r0=[12 zeros][amp4B][amp2B][14 zeros][amp3B][amp1B][2 zeros]
r0 = r0 + r6; // add to tempA matches.
/*
r0 currently is:
r0.h = [12 zeros][amp4B][amp2B][amp4A][amp2A]
r0.l = [12 zeros][amp3B][amp1B][amp3A][amp1A]
Need to merge the two nibbles into a byte. Then record down the match results.
*/
r0.h = r0.h << 4;
r0.l = r0.l + r0.h; // r0.l is now [8 zeros][amp4B][amp2B][amp4A][amp2A][amp3B][amp1B][amp3A][amp1A]
r0 = r0.b (z);
//---------------------------------------------------------------------------------------
// Record the match byte for this channel group and refresh its 7-bit encoding.
p0 = [FP - FP_CHAN];
p1 = [FP - FP_ENC_LUT_BASE]; // 3 cycle latency before we can use p1 (??why?)
p5 = [FP - FP_MATCH_BASE];
r6 = p0; // r6 = current group of channels
p5 = p5 + p0; // p5 = address to write the template matching byte to
r1 = b[p5];
r1 = r1 | r0; // Need to OR in case both template get spikes within the time of a pkt
p0 = r1; // p0=match-byte, act as offset to 8bit-to-7bit LUT
b[p5] = r1; // Update the 8bit template matches.
// After updating the template matches for this group, need to update its corresponding 7bit encoding.
// This 7bit byte will be added into the radio packet.
p3 = 32;
p5 = p5+p3; // Corresponding location in 7bit region for this group of chans
p1 = p1+p0; // Get corresponding LUT address,
r2 = b[p1]; // read 7bit encoding from LUT,
b[p5] = r2; // write encoding to 7bit region
r0 = 31;
cc = r6 == r0;
if !cc jump end_txchan (bp);
// Add to radio packet if current group is 31
// p0 and p5 are free; p1 and p3 are static and reset below.
// p4 points to WFBUF - location in a pkt we are writing the new samples to
// Below we write new samples to current packet.
p0 = [FP - FP_TXCHAN0];
p1 = [FP - FP_TXCHAN1];
p3 = [FP - FP_TXCHAN2];
p5 = [FP - FP_TXCHAN3];
r0 = b[p0];
r1 = b[p1];
r2 = b[p3];
r3 = b[p5];
b[p4++] = r0;
b[p4++] = r1;
b[p4++] = r2;
b[p4++] = r3;
r4 = [FP - FP_QS]; // r4 = Bytes of samples for each channel in pkt: 0-6
r7 = 6;
r4 += 1;
i3 += 4; // End of an iteration, previous bytepack(t-14) becomes the next bytepack(t-15)
cc = r4 == r7;
if !cc jump end_txchan_qs (bp);
// if we don't have 6 samples for each TXCHANx yet, then continue;
// otherwise add the template matches and finish constructing the pkt
r5 = [FP - FP_QPACKETS]; // r5 = Number of queued pkts, 0-15
p1 = r5; // Used to index to state_lut; hide 4 cycle latency
p0 = [FP - FP_STATE_LUT_BASE];
p5 = [FP - FP_MATCH_PTR7]; // p5=start of 7bit templ matches record
r5 += 1; // Add one more packet on the queue.
[FP - FP_QPACKETS] = r5; // Update # of queued pkts
p0 = (p0 + p1) << 2; // Address to find frame#-in-pkt (QS) mask. Table is 4 byte align
r5 = [FP - FP_ECHO]; // r5 = Echo from gtkclient - echo flag in bits 31, 23, 15, 7.
r7 = [p0]; // r7 = QS mask
// read in the 7bit template matches (2 32-bit words per pkt)
r0 = [p5++];
r1 = [p5++];
// Flag upper bits in each byte with QS & echo bits
r0 = r0 | r7;
r1 = r1 | r5;
// Reset the area just read in 7bit and 8bit MATCH memory
r7 = p5; // r7 = Address in 7bit area right now
p0 = -36;
p5 = p5 + p0; // p5 = corresponding address in 8bit (32+4 for post-inc)
bitclr(r7,6); // Reset r7 pointer
bitset(r7,5); // to keep it within 7b-encoding region
[FP - FP_MATCH_PTR7] = r7; // and write it back
// Write masked template match to pkt
[p4++] = r0;
[p4++] = r1;
r4 = 0; // Clear sample count in qs (also reset template match in 8b region).
[p5--] = r4; // Reset template match,
[p5--] = r4; // in 8bit region
end_txchan_qs:
[FP - FP_QS] = r4; // if we just finished a pkt, QS=0, otherwise it was incremented already.
// make p4 loop in WFBUF region
r7 = p4;
bitclr(r7, 10); // two 512-byte frames. loop back after going through 1024 bytes.
p4 = r7;
end_txchan:
// Restore the two pointers the SPI/radio helper routines rely on after every call here.
p1 = [FP - FP_FIO_FLAG_D]; // reset the pointers.
p3 = [FP - FP_SPI_TDBR];
rts;
// _clearirq_asm: clear the radio's interrupt flags by sending a two-byte
// "write status register" command over SPI, framed by one CSN toggle.
// Expects (not set up here): p1 and p3 already valid -- the flag-register offsets
// below are taken relative to FIO_FLAG_D, and the command bytes are stored through p3.
// Uses r6 and r7 as scratch. RETS is saved on the stack because the routine
// itself issues calls.
// _get_asm is defined elsewhere in this file and is called after every I/O write.
// NOTE(review): presumably it services the sample stream and thereby paces the SPI
// transfers, and leaves p1/p3 valid on return -- confirm against its definition.
_clearirq_asm: //just write the status register via spi to clear.
[--sp] = rets; // the calls below overwrite RETS, so keep our return address on the stack
r7 = SPI_CSN;
w[p1 + (FIO_FLAG_C - FIO_FLAG_D)] = r7; // write SPI_CSN bit to the flag-clear register: start of SPI transaction
call _get_asm;
r6 = NOR_STATUS + 0x20; // command byte: write to status register (0x20 is presumably the radio's write-register opcode -- confirm)
w[p3] = r6; // send the command byte through p3
call _get_asm;
r6 = 0x70;
w[p3] = r6; //clear irq: set RX_DR to clear bit, set TX_DS to clear bit, set MAX_RT to clear bit
call _get_asm;
r7 = SPI_CSN; // reloaded rather than reused -- presumably _get_asm does not preserve r7; confirm
w[p1 + (FIO_FLAG_S - FIO_FLAG_D)] = r7; // write SPI_CSN bit to the flag-set register: end of SPI transaction
call _get_asm;
rets = [sp++]; // restore our return address
rts;
// _waitirq_asm: poll bit 3 of the 16-bit word at [p1] (the radio IRQ pin) until it
// reads 0 or until a countdown kept in [fp - FP_TIMER] reaches zero, calling
// _get_asm between polls.
// Expects (not set up here): p1 pointing at the flag data register and fp at the
// variable frame.
// Uses r6 and r7 as scratch. RETS is saved on the stack because the routine
// itself issues calls.
// Both exits (pin low, or countdown expired) converge on waitirq_end; this routine
// does not store any status that distinguishes them.
// The countdown lives in memory and is reloaded after each _get_asm call --
// presumably because _get_asm does not preserve r7; confirm against its definition.
_waitirq_asm:
[--sp] = rets; // the calls below overwrite RETS, so keep our return address on the stack
r7 = 182; // should take max 360us = 178. min @ 1msps = 160; original value=182
[fp - FP_TIMER] = r7; // initialise the timeout countdown
waitirq_loop:
r6 = w[p1]; // read the flag pins
cc = bittst(r6, 3); //irq pin.
if !cc jump waitirq_end; // pin reads 0: stop waiting
call _get_asm;
r6 = 0;
r7 = [fp - FP_TIMER]; // reload the countdown from memory
cc = r6 == r7;
if cc jump waitirq_end; // countdown exhausted: give up waiting
r7 += -1;
[fp - FP_TIMER] = r7; // store the decremented countdown before the next call
call _get_asm; // second _get_asm call of this loop iteration
jump waitirq_loop;
waitirq_end:
call _get_asm; // one more _get_asm call on the way out, on either exit path
rets = [sp++]; // restore our return address
rts;
// _clearfifos_asm: flush the radio's RX FIFO and then its TX FIFO by sending
// NOR_FLUSH_RX and NOR_FLUSH_TX as two separate one-byte SPI transactions, each
// framed by its own CSN toggle.
// Uses r6 and r7 as scratch. RETS is saved on the stack because the routine
// itself issues calls.
// _get_asm is defined elsewhere in this file and is called after every I/O write.
// r7 is reloaded with SPI_CSN before each flag write -- presumably because _get_asm
// does not preserve it; confirm against its definition.
_clearfifos_asm:
// p3 points to SPI_TDBR, restored from _get_asm
// sending to the Nordic radio through SPI follows clearing CSN and precedes setting CSN
[--sp] = rets; // the calls below overwrite RETS, so keep our return address on the stack
// flush RX fifo (seems to improve reliability)
r7 = SPI_CSN;
w[p1 + (FIO_FLAG_C - FIO_FLAG_D)] = r7; // clear CSN: start of transaction
call _get_asm;
r6 = NOR_FLUSH_RX;
w[p3] = r6; // command is sent through SPI when SPI_TDBR is written to.
call _get_asm;
r7 = SPI_CSN;
w[p1 + (FIO_FLAG_S - FIO_FLAG_D)] = r7; // set CSN: end of transaction
call _get_asm;
// and flush TX fifo.
r7 = SPI_CSN;
w[p1 + (FIO_FLAG_C - FIO_FLAG_D)] = r7; // clear CSN: start of transaction
call _get_asm;
r6 = NOR_FLUSH_TX;
w[p3] = r6; // send the flush-TX command byte
call _get_asm;
r7 = SPI_CSN;
w[p1 + (FIO_FLAG_S - FIO_FLAG_D)] = r7; // set CSN: end of transaction
call _get_asm;
// return.
rets = [sp++]; // restore our return address
rts;
.global _radio_bidi_asm // start of firmware execution
_radio_bidi_asm:
// set SPI baudrate
p5.l = LO(SPI_BAUD);
p5.h = HI(SPI_BAUD);
r0 = 4 (z); // For a core clock of 80MHz, run it at 10MHz.
// Nordic has 8Mhz as highest SCK frequency, but 10MHz has been working for Tim.
w[p5] = r0; // must be 16-bits-word-sized-write
//------------------- Initialize variable and state pointers ---------------------
// FP : points to FP_BASE
fp.l = LO(FP_BASE);
fp.h = HI(FP_BASE);
// P0: free, though usually points to Serial port RX data register
p0.l = LO(SPORT0_RX);
p0.h = HI(SPORT0_RX);
[FP - FP_SPORT0_RX] = p0;
// P1: portF - FIO_FLAG_D (can be used for other stuff too)
p1.l = LO(FIO_FLAG_D);
p1.h = HI(FIO_FLAG_D);
[FP - FP_FIO_FLAG_D] = p1;
// P2: pointer for READING through WFBUF when transferring packet data for radio to transmit
p2.l = LO(WFBUF);
p2.h = HI(WFBUF);
// P3: SPI TDBR - writing SPI commands/data to radio
p3.l = LO(SPI_TDBR);
p3.h = HI(SPI_TDBR);
[FP - FP_SPI_TDBR] = p3;
// P4: pointer for WRITING samples and template matches to WFBUF when the signal chain finishes.
p4.l = LO(WFBUF);
p4.h = HI(WFBUF);
// FP_CHAN: points to the CURRENT channel we are working on - pre-incremented.
r0 = 30(z); // need to account for the pipeline stuff...
[FP - FP_CHAN] = r0;
/* FP_QS: points to how many samples we have currently written to a packet -- 6 samples
* per transmitted channel, per packet.
*/
r0 = 0;
[FP- FP_QS] = r0;
// FP_QPACKETS: points to number of queued packets -- 16 packets in a frame before transmitting to radio
[FP - FP_QPACKETS] = r0;
// FP_8080, FP_5555, FP_aaaa holds (name-sake) masks to be used.
r0.l = 0x8080;
r0.h = 0x8080;
[FP - FP_8080] = r0; // used to convert signed binary to unsigned offset binary.
r0.l = 0x5555;
r0.h = 0x5555;
[FP - FP_5555] = r0; // doesn't seem to be used...
r0.l = 0xaaaa;
r0.h = 0xaaaa;
[FP - FP_AAAA] = r0; // doesn't seem to be used...
// LMS gain and weight decay
r0.l = 0x0003; // lower word=3
r0.h = 0x7fff; // higher word=0x7fff = (Q15)0.9999. LMS weight decay.
[FP - FP_WEIGHTDECAY] = r0;
// Template match buffer. Each byte holds template matches in the form:
// [ch 64B][ch 0B][ch 64A][ch 0A][ch 96B][ch 32B][ch 96A][ch 32A] (from low to high addr)
r0.l = LO(MATCH);
r0.h = HI(MATCH);
[FP - FP_MATCH_BASE] = r0; // FP_MATCH_BASE points to start of template-match buffer.
r0.l = LO(MATCH + 32);
r0.h = HI(MATCH + 32);
[FP - FP_MATCH_PTR7] = r0; // FP_MATCH_PTR7 points to the 7b-encoded version of temp-matches.
// LED blinker state
r0 = 0;
[FP - FP_BLINK] = r0;
// template encoding and state (echo bits) look up table
r0.l = LO(ENC_LUT);
r0.h = HI(ENC_LUT);
[FP - FP_ENC_LUT_BASE] = r0; // addressed byte-wise; no align needed.
r0.l = LO(STATE_LUT);
r0.h = HI(STATE_LUT);
r0 = r0 >> 2; // because we use add with shift for 4-byte word alignment.
[FP - FP_STATE_LUT_BASE] = r0; // this makes future binary math easy (see later echo-flag extraction)
/* Setup default pointers of values to transmit.
Memory layout is little-endian.
Note that only 8-bits of the selected sample can be transmitted in a radio-sample. Thus
the pointer should point to the higher byte of 16-bits word holding that sample.
To select values for [channel 1, 33, 65, 97], for example, use W1+W1_STRIDE*2*4*1 instead of W1 to
address values in W1 (each group of 4 channels occupy 2 32-bit words), where 1~[0,31] is the which
4-channel group we want.
*/
// channel 0
r0.l = LO(W1 + 1); // ch0 Intan sample output
r0.h = HI(W1 + 1);
[FP - FP_TXCHAN0] = r0;
// channel 31
r0.l = LO(W1 + W1_STRIDE*2*4*31 + 1);
r0.h = HI(W1 + W1_STRIDE*2*4*31 + 1);
[FP - FP_TXCHAN1] = r0;
// channel 32
r0.l = LO(W1 + 2 + 1)
r0.h = HI(W1 + 2 + 1);
[FP - FP_TXCHAN2] = r0;
// channel 63
r0.l = LO(W1 + W1_STRIDE*2*4*31 + 2 + 1);
r0.h = HI(W1 + W1_STRIDE*2*4*31 + 2 + 1);
[FP - FP_TXCHAN3] = r0;
/* PF pin setup:
Make sure we aren't driving anything we need not be.
- SPI_CE = PF0 -- initially high to deselect - Output - radio
- SPI_CSN = PF1 -- initially high to deselect - Output - radio
- LED_BLINK = PF4 -- initially high to light LED - Output
- SPI_CS = PF2 -- flash memory - input
- SPI_IRQ = PF3 -- radio IRQ - input
FIO_DIR = 0b 0000 0000 0001 0011 (1 = output, 0 = input)
*/
r0 = SPI_CE | SPI_CSN | LED_BLINK;
w[p1 + (FIO_DIR - FIO_FLAG_D)] = r0;
// Enable input buffer for only radio's _IRQ pin
r0 = SPI_IRQ;
w[p1 + (FIO_INEN - FIO_FLAG_D)] = r0;
ssync;
//--------------------- Signal chain coefficients -----------------
// A1 is the coefficient circular buffer. i0 is the writing pointer
i0.l = LO(A1);
i0.h = HI(A1);
l0 = A1_STRIDE*32*4; // A1_STRIDE: number of 32-bit words * 4 bytes/word * 32 four-channel groups
m0 = -7*4; // for moving back to start of LMS weights (1 group = 7 prev ch for cur samp)
b0 = i0;
// i1 is for reading the W1 circular buffer -- contains the value of the samples
i1.l = LO(W1);
i1.h = HI(W1);
l1 = W1_STRIDE*2*32*4; // 2 32-bit words/group, 32 group total. This is number of bytes
m1 = -7*W1_STRIDE*2*4; // Moving back to W1-data slot 7 channels (before current one)
b1 = i1;
// i2 is for writing delays in the W1 circular buffer. i2 usually lags i1 (i.e. update values).
i2 = i1;
l2 = l1;
m2 = W1_STRIDE*2*4; // moving forward one channel in W1-data
b2 = b1;
// i3 is for reading/writing template delays (post-filter)
i3.l = LO(T1);
i3.h = HI(T1);
l3 = T1_LENGTH;
b3 = i3;
m3 = 2*4; // 2 32-bit words -- used to jump to read point for LMS
/* Next, we go through circular buffer A1, and set up the coefficients needed for the
signal chain.
Each group of 4 channels require 4 32-bit words for AGC.
(see memory_AGC.h and radio_AGC_memoryMap.ods).
The memory structure of A1 is then made up of 32 A1_strides(also see memory_map spreadsheet),
each A1 stride contains the following:
Aperture B (2 32-bit words) <--- high address
Templates B (16 32-bit words)
Aperture A (2 32-bit words)
Templates A (16 32-bit words)
IIR coefs (8 32-bit words)
LMS coefs (7 32-bit words)
AGC coefs (2 32-bit words)
Integ coefs (2 32-bit words)
IIR coefs (8 32-bit words)
LMS coefs (7 32-bit words)
AGC coefs (2 32-bit words)
Integ coefs (2 32-bit words) <--- low address
There are a total of 32 groups of 4 channels. In each iteration of lt_top, we populate 1 A1-stride.
Each lt2_top iteration populates 2 of the 4 channels in a particular group.
Our counter will be at 63, this makes our pointers start at the first block, corresponding to
starting sample acquisition at ch[31,63,95,127]
*/
p5 = 32+31 (x);
lsetup(lt_top, lt_bot) lc0 = p5; // each loop of lt_top is one A1_stride
lt_top:
p5 = 2;
lsetup(lt2_top, lt2_bot) lc1 = p5; // each lt2_top is one set of coefs
lt2_top:
// gain and integrator
// Assuming input samples are 12-bits unsigned.
/*
r0.l = 32767; w[i0++] = r0.l;
r0.l = -16384; w[i0++] = r0.l;
r0.l = 16384; w[i0++] = r0.l;
r0.l = 800; w[i0++] = r0.l;
*/
// Assuming input samples are 16-bits signed. Diff from before since no s2rnd
r0.l = 0x7fff; w[i0++] = r0.l; // (Q15) 1
r0.l = 0x8000; w[i0++] = r0.l; // (Q15) -1
r0.l = 0x7fff; w[i0++] = r0.l; // (Q15) 1
r0.l = 1600; w[i0++] = r0.l; // (Q15) 0.0488
/* AGC:
Target is 6000*16384, which in Q15 is ~0.09155. We store just the target's square
root (in Q15). This is so we can later accommodate big target values (32 bits).
*/
r0.l = 9915; w[i0++] = r0.l; // AGC target sqrt = sqrt(6000*16384), Q15.
r0.l = 9915; w[i0++] = r0.l; // AGC target sqrt = sqrt(6000*16384), Q15.
r0.l = 16384; w[i0++] = r0.l; // Q15, =0.5
r0.l = 1; w[i0++] = r0.l; // Set this to zero to disable AGC. Q7.8
/* LMS coefs
Initialized to be 0 (7 32-bit words)
*/
r0 = 0 (x);
[i0++] = r0;
[i0++] = r0;
[i0++] = r0;
[i0++] = r0;
[i0++] = r0;
[i0++] = r0;
[i0++] = r0;
/* IIR filter, biquad implementation. */
// LPF1: 7000Hz cutoff
//r0 = 4041 (x); w[i0++] = r0.l; w[i0++] = r0.l; // b0
//r0 = 8081 (x); w[i0++] = r0.l; w[i0++] = r0.l; // b1
//r0 = 3139 (x); w[i0++] = r0.l; w[i0++] = r0.l; // a0
//r0 = -2917(x); w[i0++] = r0.l; w[i0++] = r0.l; // a1
// LPF1: 9000Hz cutoff
r0 = 6004 (x); w[i0++] = r0.l; w[i0++] = r0.l; // b0
r0 = 12008(x); w[i0++] = r0.l; w[i0++] = r0.l; // b1
r0 = -4594(x); w[i0++] = r0.l; w[i0++] = r0.l; // a0
r0 = -3039(x); w[i0++] = r0.l; w[i0++] = r0.l; // a1
// HPF1: 250Hz cutoff
//r0 = 15812 (x); w[i0++] = r0.l; w[i0++] = r0.l; // b0
//r0 = -31624(x); w[i0++] = r0.l; w[i0++] = r0.l; // b1
//r0 = 31604 (x); w[i0++] = r0.l; w[i0++] = r0.l; // a0
//r0 = -15260(x); w[i0++] = r0.l; w[i0++] = r0.l; // a1
// HPF1: 500Hz cutoff
r0 = 15260 (x); w[i0++] = r0.l; w[i0++] = r0.l; // b0
r0 = -30519(x); w[i0++] = r0.l; w[i0++] = r0.l; // b1
r0 = 30442 (x); w[i0++] = r0.l; w[i0++] = r0.l; // a0
r0 = -14213(x); w[i0++] = r0.l; w[i0++] = r0.l; // a1
lt2_bot: nop;
// After all the filtering, templates for each channel/neuron
// 127 113 102 111 132 155 195 235 250 224 187 160 142 126
r0 = 127; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8; //this is the 'new' sample.
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //16 template b is in order.
r0 = 113; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8; //not efficient but whatever.
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //1
r0 = 102; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //2
r0 = 111; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //3
r0 = 132; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //4
r0 = 155; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //5
r0 = 195; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //6
r0 = 235; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //7
r0 = 250; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //8
r0 = 224; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //9
r0 = 187; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //10
r0 = 160; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //11
r0 = 142; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //12
r0 = 126; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //13
r0 = 120; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //14
r0 = 110; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //15
//aperture: default small.
r0.l = 56; r0.h = 56; [i0++] = r0; [i0++] = r0;
// 127 113 102 111 132 155 195 235 250 224 187 160 142 127
r0 = 127; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8; //not efficient but whatever.
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //1
r0 = 113; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //2
r0 = 102; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //3
r0 = 111; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //4
r0 = 132; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;
r0 = r0 | r1; r0 = r0 | r2; r0 = r0 | r3; [i0++] = r0; //5
r0 = 155; r1 = r0 << 8; r2 = r1 << 8; r3 = r2 << 8;