an_itoa.c
#include "an_itoa.h"
#include <string.h>
/*
* We only split this code into helper functions for clarity. Eyeballing -S output
* reveals GCC makes strange inlining decisions on this file,
* especially with AN_CC_LIKELY. Make sure our helpers are really
* inlined.
*/
#define INLINE AN_CC_INLINE static
typedef __uint128_t uint128_t;
/* Powers of 10. */
#define PO2 100ULL
#define PO4 10000ULL
#define PO8 100000000ULL
#define PO10 10000000000ULL
#define PO16 10000000000000000ULL
/* 64 bits' worth of '0' characters (i.e., 8 characters). */
#define ZERO_CHARS 0x3030303030303030ULL
/**
* encode_* functions unpack (pairs of) numbers into BCD: each byte
* contains exactly one decimal digit.
*
* The basic idea is to use SWAR (SIMD within a register) and perform
* low-precision arithmetic on several values in parallel.
*
* Most non-obviousness lies in the conversion of integer division
* constants to multiplication, shift and mask by hand. Decent
* compilers do it for scalars, but we can't easily express a SWAR
* integer division.
*
* The trick is to choose a low enough precision that the fixed point
* multiplication won't overflow into the next packed value (and high
* enough that the truncated division is exact for the relevant
* range), and to pack values so that the final result ends up in the
* byte we want. There are formulae to determine how little precision
* we need given an input range and a constant divisor, but for our
* cases, one can also check exhaustively (:
*
* The remainder is simple: given d = x / k, x % k = x - k * d.
*/
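/*
* For example, the 103/1024 ~= 1/10 approximation used throughout is
* 1/10 + 3/5120; for 0 <= x < 100 the error term 3x/5120 stays below
* 1/10, so the truncated quotient is exact: x = 87 gives
* (87 * 103) >> 10 = 8961 >> 10 = 8, and 87 - 10 * 8 = 7.
*/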
/**
* SWAR unpack [100 * hi + lo] to 4 decimal bytes, assuming hi and lo
* \in [0, 100)
*/
INLINE uint32_t
encode_hundreds(uint32_t hi, uint32_t lo)
{
/*
* Pack everything in a single 32 bit value.
*
* merged = [ hi 0 lo 0 ]
*/
uint32_t merged = hi | (lo << 16);
/*
* Fixed-point multiplication by 103/1024 ~= 1/10.
*/
uint32_t tens = (merged * 103UL) >> 10;
/*
* Mask away garbage bits between our digits.
*
* tens = [ hi/10 0 lo/10 0 ]
*
* On a platform with more restricted literals (ARM, for
* instance), it may make sense to and-not with the middle
* bits.
*/
tens &= (0xFUL << 16) | 0xFUL;
/*
* x mod 10 = x - 10 * (x div 10).
*
* (merged - 10 * tens) = [ hi%10 0 lo%10 0 ]
*
* Then insert these values between tens. Arithmetic instead
* of bitwise operation helps the compiler merge this with
* later increments by a constant (e.g., ZERO_CHARS).
*/
return tens + ((merged - 10UL * tens) << 8);
}
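/*
* For example, encode_hundreds(12, 34) (i.e., 1234) returns
* 0x04030201: the bytes 01 02 03 04 in little-endian memory order,
* which read "1234" once the '0' bias is added.
*/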
/**
* SWAR encode 10000 * hi + lo to byte (unpacked) BCD.
*/
INLINE uint64_t
encode_ten_thousands(uint64_t hi, uint64_t lo)
{
uint64_t merged = hi | (lo << 32);
/* Truncated division by 100: 10486 / 2**20 ~= 1/100. */
uint64_t top = ((merged * 10486ULL) >> 20) & ((0x7FULL << 32) | 0x7FULL);
/* Trailing 2 digits in the 1e4 chunks. */
uint64_t bot = merged - 100ULL * top;
uint64_t hundreds;
uint64_t tens;
/*
* We now have 4 radix-100 digits in little-endian order, each
* in its own 16 bit area.
*/
hundreds = (bot << 16) + top;
/* Divide and mod by 10 all 4 radix-100 digits in parallel. */
tens = (hundreds * 103ULL) >> 10;
tens &= (0xFULL << 48) | (0xFULL << 32) | (0xFULL << 16) | 0xFULL;
tens += (hundreds - 10ULL * tens) << 8;
return tens;
}
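/*
* For example, encode_ten_thousands(1234, 5678) (i.e., 12345678)
* returns 0x0807060504030201, the digits 1 through 8 in little-endian
* memory order; adding ZERO_CHARS turns that into "12345678".
*/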
/**
* Range-specialised version of itoa.
*
* We always convert to fixed-width BCD then shift away any leading
* zero. The slop will manifest as writing zero bytes after our
* encoded string, which is acceptable: we never write more than the
* maximal length (10 or 20 characters).
*/
/**
* itoa for x < 100.
*/
INLINE char *
itoa_hundred(char *out, unsigned int x)
{
/*
* -1 if x < 10, 0 otherwise. Tried to get an sbb, but ?:
* gets us branches.
*/
int small = (int)(x - 10) >> 8;
unsigned int base = (unsigned int)'0' | ((unsigned int)'0' << 8);
/*
* Probably not necessary, but why not abuse smaller constants?
* Also, see block comment above idiv_POx functions.
*/
unsigned int hi = (x * 103UL) >> 10;
unsigned int lo = x - 10UL * hi;
base += hi + (lo << 8);
/* Shift away the leading zero (shift by 8) if x < 10. */
base >>= small & 8;
memcpy(out, &base, 2);
/* 2 + small = 1 if x < 10, 2 otherwise. */
return out + 2 + small;
}
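/*
* For example, x = 7: small = -1, hi = 0, lo = 7, so base becomes
* 0x3730; the shift by 8 leaves '7' in the low byte and we return
* out + 1. For x = 42: small = 0, hi = 4, lo = 2, base is 0x3234
* ("42" in little-endian memory), and we return out + 2.
*/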
/**
* itoa for x < 10k.
*/
INLINE char *
itoa_ten_thousand(char *out, unsigned int x)
{
uint32_t buf;
unsigned int x_div_PO2, x_mod_PO2;
unsigned int zeros;
x_div_PO2 = (x * 10486UL) >> 20;
x_mod_PO2 = x - PO2 * x_div_PO2;
buf = encode_hundreds(x_div_PO2, x_mod_PO2);
/*
* Count leading (in memory, trailing in register: we're
* little endian) zero bytes: count trailing zero bits and
* round down to a multiple of 8.
*/
zeros = __builtin_ctz(buf) & -8U;
buf += ZERO_CHARS; /* BCD -> ASCII. */
buf >>= zeros; /* Shift away leading zero characters */
memcpy(out, &buf, 4);
/* zeros is in bits; convert to bytes to find actual length. */
return out + 4 - zeros / 8;
}
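/*
* For example, x = 308: encode_hundreds(3, 8) = 0x08000300, so
* zeros = 8; adding the '0' bias and shifting leaves "308" in the low
* three bytes and we return out + 3. The fourth byte written by the
* memcpy is harmless slop (a zero byte past the returned end).
*/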
/**
* 32 bit helpers for truncated division by constants.
*
* We only need them because GCC is stupid with likely/unlikely
* annotations: unlikely code is compiled with an extreme emphasis on
* size, up to compiling integer division by constants to actual div
* instructions. In turn, we want likely annotations because we only
* get a nice ladder of forward conditional jumps when there is no
* code between if blocks. We convince GCC that our "special" cases
* for shorter integers aren't slowpathed guards by marking each
* conditional as likely.
*
* The constants are easily proven correct (or compared with those
* generated by a reference compiler, e.g., GCC or clang). For
* example,
*
* 1/10000 ~= k = 3518437209 / 2**45 = 1/10000 + 73/21990232555520000.
*
* Let eps = 73/21990232555520000; for any 0 <= x < 2**32,
* floor(k * x) <= floor(x / 10000 + x * eps)
* <= floor(x / 10000 + 2**32 * eps)
* <= floor(x / 10000 + 2e-5).
*
* Given that x is an integer, the fractional part of x / 10000 is a
* multiple of 1/10000, so flooring the left- and right-hand sides
* yields the same value as long as the error term
* (x * eps <= 2e-5) stays below 1/10000; indeed 2e-5 < 1e-4. We finally
* conclude that 3518437209 / 2**45, our fixed point approximation of
* 1/10000, is always correct for truncated division of 32 bit
* unsigned ints.
*/
/**
* Divide a 32 bit int by 1e4.
*/
INLINE uint32_t
idiv_PO4(uint32_t x)
{
uint64_t wide = x;
uint64_t mul = 3518437209UL;
return (wide * mul) >> 45;
}
/**
* Divide a 32 bit int by 1e8.
*/
INLINE uint64_t
idiv_PO8(uint32_t x)
{
uint64_t wide = x;
uint64_t mul = 1441151881UL;
return (wide * mul) >> 57;
}
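/*
* The same argument covers 1441151881 / 2**57 ~= 1/1e8: for
* x < 2**32, the error term stays below 1e-8. Spot checks:
* idiv_PO4(4294967295) = 429496 and idiv_PO8(4294967295) = 42, both
* matching truncated division.
*/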
char *
an_itoa(char *out, uint32_t x)
{
uint64_t buf;
uint32_t x_div_PO4;
uint32_t x_mod_PO4;
uint32_t x_div_PO8;
/*
* Smaller numbers can be encoded more quickly. Special
* casing them makes a significant difference compared to
* always going through 8-digit encoding.
*/
if (AN_CC_LIKELY(x < PO2)) {
return itoa_hundred(out, x);
}
if (AN_CC_LIKELY(x < PO4)) {
return itoa_ten_thousand(out, x);
}
/*
* Manual, souped-up common subexpression elimination.
*
* The sequel always needs x / PO4 and x % PO4. Compute them
* here, before branching. We may also need x / PO8 if
* x >= PO8. Benchmarking shows that performing this division
* by constant unconditionally doesn't hurt. If x >= PO8, we'll
* always want x_div_PO4 = (x % PO8) / PO4. We compute that
* in a roundabout manner to reduce the makespan, i.e., the
* length of the dependency chain for (x % PO8) % PO4 = x % PO4.
*/
x_div_PO4 = idiv_PO4(x);
x_mod_PO4 = x - x_div_PO4 * PO4;
x_div_PO8 = idiv_PO8(x);
/*
* We actually want x_div_PO4 = (x % PO8) / PO4.
* Subtract the part contributed by the digits above PO8
* (x_div_PO8 * PO4) from x_div_PO4.
*/
x_div_PO4 -= x_div_PO8 * PO4;
/*
* Finally, we can unconditionally call encode_ten_thousands on
* the values we obtain after the division by PO8 and the fixup
* by x_div_PO8 * PO4.
*/
buf = encode_ten_thousands(x_div_PO4, x_mod_PO4);
if (AN_CC_LIKELY(x < PO8)) {
unsigned zeros;
zeros = __builtin_ctzll(buf) & -8U;
buf += ZERO_CHARS;
buf >>= zeros;
memcpy(out, &buf, 8);
return out + 8 - zeros / 8;
}
/* 32 bit integers are always below 1e10, so x_div_PO8 < 100 and itoa_hundred applies. */
buf += ZERO_CHARS;
out = itoa_hundred(out, x_div_PO8);
memcpy(out, &buf, 8);
return out + 8;
}
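/*
* For example, an_itoa(out, 1234567890): x_div_PO4 = 123456,
* x_mod_PO4 = 7890, x_div_PO8 = 12, and the fixup leaves
* x_div_PO4 = 3456. itoa_hundred writes "12", the memcpy appends
* "34567890", and we return out + 10.
*/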
/**
* 64 bit helpers for truncated division by constants.
*/
/**
* Divide a 64 bit int by 1e4.
*/
INLINE uint64_t
ldiv_PO4(uint64_t x)
{
uint128_t wide = x;
uint128_t mul = 3777893186295716171ULL;
return (wide * mul) >> 75;
}
/**
* Divide a 64 bit int by 1e8.
*/
INLINE uint64_t
ldiv_PO8(uint64_t x)
{
uint128_t wide = x;
uint128_t mul = 12379400392853802749ULL;
return (wide * mul) >> 90;
}
/**
* Divide a 64 bit int by 1e16.
*/
INLINE uint64_t
ldiv_PO16(uint64_t x)
{
uint128_t wide = x;
uint128_t mul = 4153837486827862103ULL;
return (wide * mul) >> 115;
}
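/*
* These constants follow the same recipe as the 32 bit versions.
* For instance, 4153837486827862103 / 2**115 = 1/1e16 plus an error
* term that stays below 1e-16 even for x = 2**64 - 1, so
* ldiv_PO16(UINT64_MAX) = 1844, matching truncated division.
*/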
char *
an_ltoa(char *out, uint64_t x)
{
uint64_t x_div_PO4;
uint64_t x_mod_PO4;
uint64_t x_div_PO8;
uint64_t buf;
if (AN_CC_LIKELY(x < PO2)) {
return itoa_hundred(out, x);
}
if (AN_CC_LIKELY(x < PO4)) {
return itoa_ten_thousand(out, x);
}
x_div_PO4 = ldiv_PO4(x);
x_mod_PO4 = x - x_div_PO4 * PO4;
/*
* Benchmarking shows the long division by PO8 hurts
* performance for PO4 <= x < PO8. Keep encode_ten_thousands
* conditional for an_ltoa.
*/
if (AN_CC_LIKELY(x < PO8)) {
uint64_t buf;
unsigned zeros;
buf = encode_ten_thousands(x_div_PO4, x_mod_PO4);
zeros = __builtin_ctzll(buf) & -8U;
buf += ZERO_CHARS;
buf >>= zeros;
memcpy(out, &buf, 8);
return out + 8 - zeros / 8;
}
/* See block comment in an_itoa. */
x_div_PO8 = ldiv_PO8(x);
x_div_PO4 = x_div_PO4 - x_div_PO8 * PO4;
buf = encode_ten_thousands(x_div_PO4, x_mod_PO4) + ZERO_CHARS;
/*
* Add a case for PO8 <= x < PO10 because itoa_hundred is much
* quicker than a second call to encode_ten_thousands; the
* same isn't true of itoa_ten_thousand.
*/
if (AN_CC_LIKELY(x < PO10)) {
out = itoa_hundred(out, x_div_PO8);
memcpy(out, &buf, 8);
return out + 8;
}
/*
* Again, long division by PO16 hurts, so do the rest
* conditionally.
*/
if (AN_CC_LIKELY(x < PO16)) {
uint64_t buf_hi;
uint32_t hi_hi, hi_lo;
unsigned zeros;
/* x_div_PO8 < PO8 < 2**32, so idiv_PO4 is safe. */
hi_hi = idiv_PO4(x_div_PO8);
hi_lo = x_div_PO8 - hi_hi * PO4;
buf_hi = encode_ten_thousands(hi_hi, hi_lo);
zeros = __builtin_ctzll(buf_hi) & -8U;
buf_hi += ZERO_CHARS;
buf_hi >>= zeros;
memcpy(out, &buf_hi, 8);
out += 8 - zeros / 8;
memcpy(out, &buf, 8);
return out + 8;
} else {
uint64_t hi = ldiv_PO16(x);
uint64_t mid = x_div_PO8 - hi * PO8;
uint64_t buf_mid;
uint32_t mid_hi, mid_lo;
mid_hi = idiv_PO4(mid);
mid_lo = mid - mid_hi * PO4;
buf_mid = encode_ten_thousands(mid_hi, mid_lo) + ZERO_CHARS;
out = itoa_ten_thousand(out, hi);
memcpy(out, &buf_mid, 8);
memcpy(out + 8, &buf, 8);
return out + 16;
}
}
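/*
* Minimal usage sketch, not part of the original library: the
* AN_ITOA_DEMO guard, the buffer size, and the NUL placement below
* are illustrative assumptions. Callers must leave room for the slop
* bytes described above (an_itoa may write up to 10 bytes, an_ltoa up
* to 20); the returned pointer points just past the last digit.
*/
#ifdef AN_ITOA_DEMO
#include <stdio.h>
int
main(void)
{
char buf[32];
/* Terminate at the returned end pointer to obtain C strings. */
*an_itoa(buf, 4294967295U) = '\0';
printf("%s\n", buf); /* 4294967295 */
*an_ltoa(buf, 18446744073709551615ULL) = '\0';
printf("%s\n", buf); /* 18446744073709551615 */
return 0;
}
#endif /* AN_ITOA_DEMO */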