-
Notifications
You must be signed in to change notification settings - Fork 3.4k
/
qnn.py
484 lines (386 loc) · 14.1 KB
/
qnn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#pylint: disable=invalid-name
"""QNN dialect operators."""
from __future__ import absolute_import as _abs
from tvm.relay.expr import Tuple, TupleWrapper
from tvm.relay.op.nn.util import get_pad_tuple2d
from . import _make
def requantize(data,
               input_scale,
               input_zero_point,
               output_scale,
               output_zero_point,
               axis=-1,
               rounding="UPWARD",
               out_dtype="int8"):
    r"""Requantize operator.

    Converts a tensor from one quantized representation to another. The
    scale and zero point of the target representation are supplied by the
    caller, and the result is computed as

        Q_output = zp_output + (scale_input / scale_output) * (Q_input - zp_input)

    Parameters
    ----------
    data : tvm.relay.Expr
        The input data to the operator.

    input_scale : tvm.relay.Expr
        The quantization scale of the input tensor.

    input_zero_point : tvm.relay.Expr
        The zero point of the input tensor.

    output_scale : tvm.relay.Expr
        The quantization scale of the output tensor.

    output_zero_point : tvm.relay.Expr
        The zero point of the output tensor.

    axis : int, optional
        The channel axis for quantization. Defaults to -1, i.e. the last axis.

    rounding : str, optional
        The rounding direction used when a value lies exactly midway between
        two representable values.

    out_dtype : str, optional
        The data type of the output.

    Returns
    -------
    result : tvm.relay.Expr
        The computed result.
    """
    # Positional order matches the one expected by the _make.requantize call.
    call_args = (
        data,
        input_scale,
        input_zero_point,
        output_scale,
        output_zero_point,
        axis,
        rounding,
        out_dtype,
    )
    return _make.requantize(*call_args)
def quantize(data,
             output_scale,
             output_zero_point,
             axis=-1,
             out_dtype='int8'):
    r"""Quantize operator.

    Takes a float32 tensor of any shape and produces a quantized int8 or
    uint8 tensor of the same shape:

        Q_output = clamp((round(input_tensor / output_scale) + output_zero_point),
                         out_dtype::min,
                         out_dtype::max)

    Parameters
    ----------
    data : tvm.relay.Expr
        The input tensor to be quantized. Can be of type float32.

    output_scale : tvm.relay.Expr
        The output scale.

    output_zero_point : tvm.relay.Expr
        The output zero point.

    axis : int, optional
        The channel axis for quantization. Defaults to -1, i.e. the last axis.

    out_dtype : str, optional
        The data type of the output tensor. Can be [int8, uint8, int32].

    Returns
    -------
    result : tvm.relay.Expr
        The computed result.
    """
    quantized = _make.quantize(data, output_scale, output_zero_point, axis, out_dtype)
    return quantized
def dequantize(data,
               input_scale,
               input_zero_point):
    r"""Dequantize operator.

    Takes a quantized int8 or uint8 tensor of any shape and produces a
    dequantized float32 tensor of the same shape.

    Parameters
    ----------
    data : tvm.relay.Expr
        The input tensor to be dequantized. Can be of type [int8, uint8].

    input_scale : tvm.relay.Expr
        The input scale.

    input_zero_point : tvm.relay.Expr
        The input zero point.

    Returns
    -------
    result : tvm.relay.Expr
        The computed result.
    """
    dequantized = _make.dequantize(data, input_scale, input_zero_point)
    return dequantized
def concatenate(data,
                input_scales,
                input_zero_points,
                output_scale,
                output_zero_point,
                axis):
    """Concatenate quantized tensors along a given axis.

    Parameters
    ----------
    data : Union(List[relay.Expr], Tuple[relay.Expr], TupleWrapper[relay.Expr])
        The quantized tensors to concatenate.

    input_scales : List[relay.Expr]
        The scales of the input quantized tensors.

    input_zero_points : List[relay.Expr]
        The zero points of the input quantized tensors.

    output_scale : relay.Expr
        The scale of the output quantized tensor.

    output_zero_point : relay.Expr
        The zero point of the output quantized tensor.

    axis : int
        The axis along which the tensors are concatenated.

    Returns
    -------
    result : relay.Expr
        The concatenated quantized tensor.

    Raises
    ------
    ValueError
        If ``axis`` is not an ``int``.
    """
    # Python sequences are wrapped in a relay Tuple; a TupleWrapper is
    # unwrapped to the tuple expression it holds.
    tensors = Tuple(data) if isinstance(data, (list, tuple)) else data
    if isinstance(tensors, TupleWrapper):
        tensors = tensors.tuple_value

    if not isinstance(axis, int):
        raise ValueError("For now, we only support integer axis")

    # Materialize both parameter sequences before building the relay Tuples.
    scale_list = list(input_scales)
    zero_point_list = list(input_zero_points)
    scales_expr = Tuple(scale_list)
    zero_points_expr = Tuple(zero_point_list)

    return _make.concatenate(tensors,
                             scales_expr,
                             zero_points_expr,
                             output_scale,
                             output_zero_point,
                             axis)
def conv2d(data,
           kernel,
           input_zero_point,
           kernel_zero_point,
           input_scale,
           kernel_scale,
           kernel_size,
           channels,
           strides=(1, 1),
           padding=(0, 0),
           dilation=(1, 1),
           groups=1,
           data_layout="NCHW",
           kernel_layout="OIHW",
           out_layout="",
           out_dtype="int32"):
    r"""Quantized 2D convolution.

    Convolves quantized data with a quantized kernel. The scale of the
    output quantized tensor is the product of kernel_scale and input_scale,
    and its zero point is 0. The output dtype is int32 by default; see the
    Requantize operator for how to scale the int32 output back to (u)int8.

    Parameters
    ----------
    data : tvm.relay.Expr
        The input data to the operator.

    kernel : tvm.relay.Expr
        The kernel expressions.

    input_zero_point : tvm.relay.Expr
        The zero point of the data distribution.

    kernel_zero_point : tvm.relay.Expr
        The zero point of the quantized_kernel distribution.

    input_scale : tvm.relay.Expr
        The scale for the input tensor. It is stored purely for convenience
        here.

    kernel_scale : tvm.relay.Expr
        The scale for the weight tensor. It is stored so that it can be
        accessed during relay; it is not needed in the pass pipeline after
        qnn.conv2d is lowered to the sequence of steps as in nn.conv2d.
        See also input_scale in Requantize.

    kernel_size : tuple of int
        The spatial width and height of the convolution kernel.

    channels : int
        Number of output channels of this convolution.

    strides : tuple of int, optional
        The strides of convolution.

    padding : tuple of int, optional
        The padding of convolution on both sides of inputs before convolution.

    dilation : tuple of int, optional
        Specifies the dilation rate to be used for dilated convolution.

    groups : int, optional
        Number of groups for grouped convolution.

    data_layout : str, optional
        Layout of the input.

    kernel_layout : str, optional
        Layout of the kernel.

    out_layout : str, optional
        Layout of the output; by default, out_layout is the same as
        data_layout.

    out_dtype : str, optional
        Specifies the output data type for mixed precision conv2d.

    Returns
    -------
    result : tvm.relay.Expr
        The computed result.
    """
    # TODO enforce 4-way padding in topi/nn/conv2d after #4644 merged
    # convert 2-way padding to 4-way padding
    pad_4way = get_pad_tuple2d(padding)

    # Note the positional order differs from this function's signature.
    call_args = (
        data, kernel,
        input_zero_point, kernel_zero_point,
        input_scale, kernel_scale,
        strides, pad_4way, dilation,
        groups, channels, kernel_size,
        data_layout, kernel_layout, out_layout, out_dtype,
    )
    return _make.conv2d(*call_args)
def add(lhs,
        rhs,
        lhs_scale,
        lhs_zero_point,
        rhs_scale,
        rhs_zero_point,
        output_scale,
        output_zero_point):
    """Quantized addition with numpy-style broadcasting.

    Parameters
    ----------
    lhs : relay.Expr
        The left hand side quantized input data.

    rhs : relay.Expr
        The right hand side quantized input data.

    lhs_scale : relay.Expr
        The scale of the lhs quantized expr.

    lhs_zero_point : relay.Expr
        The zero point of the lhs quantized expr.

    rhs_scale : relay.Expr
        The scale of the rhs quantized expr.

    rhs_zero_point : relay.Expr
        The zero point of the rhs quantized expr.

    output_scale : relay.Expr
        The scale of the output quantized expr.

    output_zero_point : relay.Expr
        The zero point of the output quantized expr.

    Returns
    -------
    result : relay.Expr
        The computed result.
    """
    # Quantization parameters, in the order: lhs, rhs, output.
    qparams = (
        lhs_scale, lhs_zero_point,
        rhs_scale, rhs_zero_point,
        output_scale, output_zero_point,
    )
    return _make.add(lhs, rhs, *qparams)
def dense(data,
          weight,
          input_zero_point,
          kernel_zero_point,
          input_scale,
          kernel_scale,
          units,
          out_dtype="int32"):
    """Qnn Dense operator.

    Applies a quantized linear transformation

    .. math::

        Y = X * W

    Parameters
    ----------
    data : tvm.relay.Expr
        The quantized input data to the operator.

    weight : tvm.relay.Expr
        The quantized weight expressions.

    input_zero_point : tvm.relay.Expr
        The input zero point.

    kernel_zero_point : tvm.relay.Expr
        The kernel zero point.

    input_scale : tvm.relay.Expr
        The scale for the input tensor.

    kernel_scale : tvm.relay.Expr
        The scale for the weight tensor. It is stored so that it can be
        accessed during relay; it is not needed in the pass pipeline once
        the operator has been lowered. See also input_scale in Requantize.

    units : int
        Number of hidden units of the dense transformation.

    out_dtype : str, optional
        Specifies the output data type for mixed precision dense; can be
        int32 or int16.

    Returns
    -------
    result : tvm.relay.Expr
        The computed result.
    """
    call_args = (
        data,
        weight,
        input_zero_point,
        kernel_zero_point,
        input_scale,
        kernel_scale,
        units,
        out_dtype,
    )
    return _make.dense(*call_args)
def mul(lhs, rhs, lhs_scale, lhs_zero_point, rhs_scale, rhs_zero_point,
        output_scale, output_zero_point):
    """Quantized multiplication with numpy-style broadcasting.

    Parameters
    ----------
    lhs : relay.Expr
        The left hand side quantized input data.

    rhs : relay.Expr
        The right hand side quantized input data.

    lhs_scale : relay.Expr
        The scale of the lhs quantized expr.

    lhs_zero_point : relay.Expr
        The zero point of the lhs quantized expr.

    rhs_scale : relay.Expr
        The scale of the rhs quantized expr.

    rhs_zero_point : relay.Expr
        The zero point of the rhs quantized expr.

    output_scale : relay.Expr
        The scale of the output quantized expr.

    output_zero_point : relay.Expr
        The zero point of the output quantized expr.

    Returns
    -------
    result : relay.Expr
        The computed result.
    """
    # Quantization parameters, in the order: lhs, rhs, output.
    qparams = (
        lhs_scale, lhs_zero_point,
        rhs_scale, rhs_zero_point,
        output_scale, output_zero_point,
    )
    return _make.mul(lhs, rhs, *qparams)
def subtract(lhs,
             rhs,
             lhs_scale,
             lhs_zero_point,
             rhs_scale,
             rhs_zero_point,
             output_scale,
             output_zero_point):
    """Quantized subtraction with numpy-style broadcasting.

    Parameters
    ----------
    lhs : relay.Expr
        The left hand side quantized input data.

    rhs : relay.Expr
        The right hand side quantized input data.

    lhs_scale : relay.Expr
        The scale of the lhs quantized expr.

    lhs_zero_point : relay.Expr
        The zero point of the lhs quantized expr.

    rhs_scale : relay.Expr
        The scale of the rhs quantized expr.

    rhs_zero_point : relay.Expr
        The zero point of the rhs quantized expr.

    output_scale : relay.Expr
        The scale of the output quantized expr.

    output_zero_point : relay.Expr
        The zero point of the output quantized expr.

    Returns
    -------
    result : relay.Expr
        The computed result.
    """
    # Quantization parameters, in the order: lhs, rhs, output.
    qparams = (
        lhs_scale, lhs_zero_point,
        rhs_scale, rhs_zero_point,
        output_scale, output_zero_point,
    )
    return _make.subtract(lhs, rhs, *qparams)