Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
431 lines (377 sloc) 10.8 KB
#include "an_itoa.h"
#include <string.h>
/*
* We only use functions for clarity here. Eyeballing -S output
* reveals GCC makes strange inlining decisions on this file,
* especially with AN_CC_LIKELY. Make sure our helpers are really
* inlined.
*/
#define INLINE AN_CC_INLINE static
typedef __uint128_t uint128_t;
/* Powers of 10. */
#define PO2 100ULL
#define PO4 10000ULL
#define PO8 100000000ULL
#define PO10 10000000000ULL
#define PO16 10000000000000000ULL
/* 64 bit's worth of '0' characters (i.e., 8 characters). */
#define ZERO_CHARS 0x3030303030303030ULL
/**
* encode_* functions unpack (pairs of) numbers into BCD: each byte
* contains exactly one decimal digit.
*
* The basic idea is to use SWAR (SIMD within a register) and perform
* low-precision arithmetic on several values in parallel.
*
* Most non-obviousness lies in the conversion of integer division
* constants to multiplication, shift and mask by hand. Decent
* compilers do it for scalars, but we can't easily express a SWAR
* integer division.
*
* The trick is to choose a low enough precision that the fixed point
* multiplication won't overflow into the next packed value (and high
* enough that the truncated division is exact for the relevant
* range), and to pack values so that the final result ends up in the
* byte we want. There are formulae to determine how little precision
* we need given an input range and a constant divisor, but for our
* cases, one can also check exhaustively (:
*
* The remainder is simple: given d = x / k, x % k = x - k * d.
*/
/**
* SWAR unpack [100 * hi + lo] to 4 decimal bytes, assuming hi and lo
* \in [0, 100)
*/
INLINE uint32_t
encode_hundreds(uint32_t hi, uint32_t lo)
{
/*
* Pack everything in a single 32 bit value.
*
* merged = [ hi 0 lo 0 ]
*/
uint32_t merged = hi | (lo << 16);
/*
* Fixed-point multiplication by 103/1024 ~= 1/10.
*/
uint32_t tens = (merged * 103UL) >> 10;
/*
* Mask away garbage bits between our digits.
*
* tens = [ hi/10 0 lo/10 0 ]
*
* On a platform with more restricted literals (ARM, for
* instance), it may make sense to and-not with the middle
* bits.
*/
tens &= (0xFUL << 16) | 0xFUL;
/*
* x mod 10 = x - 10 * (x div 10).
*
* (merged - 10 * tens) = [ hi%10 0 lo%10 0 ]
*
* Then insert these values between tens. Arithmetic instead
* of bitwise operation helps the compiler merge this with
* later increments by a constant (e.g., ZERO_CHARS).
*/
return tens + ((merged - 10UL * tens) << 8);
}
/**
* SWAR encode 10000 hi + lo to byte (unpacked) BCD.
*/
INLINE uint64_t
encode_ten_thousands(uint64_t hi, uint64_t lo)
{
uint64_t merged = hi | (lo << 32);
/* Truncate division by 100: 10486 / 2**20 ~= 1/100. */
uint64_t top = ((merged * 10486ULL) >> 20) & ((0x7FULL << 32) | 0x7FULL);
/* Trailing 2 digits in the 1e4 chunks. */
uint64_t bot = merged - 100ULL * top;
uint64_t hundreds;
uint64_t tens;
/*
* We now have 4 radix-100 digits in little-endian order, each
* in its own 16 bit area.
*/
hundreds = (bot << 16) + top;
/* Divide and mod by 10 all 4 radix-100 digits in parallel. */
tens = (hundreds * 103ULL) >> 10;
tens &= (0xFULL << 48) | (0xFULL << 32) | (0xFULL << 16) | 0xFULL;
tens += (hundreds - 10ULL * tens) << 8;
return tens;
}
/**
* Range-specialised version of itoa.
*
* We always convert to fixed-width BCD then shift away any leading
* zero. The slop will manifest as writing zero bytes after our
* encoded string, which is acceptable: we never write more than the
* maximal length (10 or 20 characters).
*/
/**
* itoa for x < 100.
*/
INLINE char *
itoa_hundred(char *out, unsigned int x)
{
/*
* -1 if x < 10, 0 otherwise. Tried to get an sbb, but ?:
* gets us branches.
*/
int small = (int)(x - 10) >> 8;
unsigned int base = (unsigned int)'0' | ((unsigned int)'0' << 8);
/*
* Probably not necessary, but why not abuse smaller constants?
* Also, see block comment above idiv_POx functions.
*/
unsigned int hi = (x * 103UL) >> 10;
unsigned int lo = x - 10UL * hi;
base += hi + (lo << 8);
/* Shift away the leading zero (shift by 8) if x < 10. */
base >>= small & 8;
memcpy(out, &base, 2);
/* 2 + small = 1 if x < 10, 2 otherwise. */
return out + 2 + small;
}
/**
* itoa for x < 10k.
*/
INLINE char *
itoa_ten_thousand(char *out, unsigned int x)
{
uint32_t buf;
unsigned int x_div_PO2, x_mod_PO2;
unsigned int zeros;
x_div_PO2 = (x * 10486UL) >> 20;
x_mod_PO2 = x - PO2 * x_div_PO2;
buf = encode_hundreds(x_div_PO2, x_mod_PO2);
/*
* Count leading (in memory, trailing in register: we're
* little endian) zero bytes: count leading zero bits and
* round down to 8.
*/
zeros = __builtin_ctz(buf) & -8U;
buf += ZERO_CHARS; /* BCD -> ASCII. */
buf >>= zeros; /* Shift away leading zero characters */
memcpy(out, &buf, 4);
/* zeros is in bits; convert to bytes to find actual length. */
return out + 4 - zeros / 8;
}
/**
* 32 bit helpers for truncation by constant.
*
* We only need them because GCC is stupid with likely/unlikely
* annotations: unlikely code is compiled with an extreme emphasis on
* size, up to compiling integer division by constants to actual div
* instructions. In turn, we want likely annotations because we only
* get a nice ladder of forward conditional jumps when there is no
* code between if blocks. We convince GCC that our "special" cases
* for shorter integers aren't slowpathed guards by marking each
* conditional as likely.
*
* The constants are easily proven correct (or compared with those
* generated by a reference compiler, e.g., GCC or clang). For
* example,
*
* 1/10000 ~= k = 3518437209 / 2**45 = 1/10000 + 73/21990232555520000.
*
* Let eps = 73/21990232555520000; for any 0 <= x < 2**32,
* floor(k * x) <= floor(x / 10000 + x * eps)
* <= floor(x / 10000 + 2**32 * eps)
* <= floor(x / 10000 + 2e-5).
*
* Given that x is unsigned, flooring the left and right -hand sides
* will yield the same value as long as the error term
* (x * eps <= 2e-5) is less than 1/10000, and 2e-5 < 10000. We finally
* conclude that 3518437209 / 2**45, our fixed point approximation of
* 1/10000, is always correct for truncated division of 32 bit
* unsigned ints.
*/
/**
* Divide a 32 bit int by 1e4.
*/
INLINE uint32_t
idiv_PO4(uint32_t x)
{
uint64_t wide = x;
uint64_t mul = 3518437209UL;
return (wide * mul) >> 45;
}
/**
* Divide a 32 bit int by 1e8.
*/
INLINE uint64_t
idiv_PO8(uint32_t x)
{
uint64_t wide = x;
uint64_t mul = 1441151881UL;
return (wide * mul) >> 57;
}
char *
an_itoa(char *out, uint32_t x)
{
uint64_t buf;
uint32_t x_div_PO4;
uint32_t x_mod_PO4;
uint32_t x_div_PO8;
/*
* Smaller numbers can be encoded more quickly. Special
* casing them makes a significant difference compared to
* always going through 8-digit encoding.
*/
if (AN_CC_LIKELY(x < PO2)) {
return itoa_hundred(out, x);
}
if (AN_CC_LIKELY(x < PO4)) {
return itoa_ten_thousand(out, x);
}
/*
* Manual souped up common subexpression elimination.
*
* The sequel always needs x / PO4 and x % PO4. Compute them
* here, before branching. We may also need x / PO8 if
* x >= PO8. Benchmarking shows that performing this division
* by constant unconditionally doesn't hurt. If x >= PO8, we'll
* always want x_div_PO4 = (x % PO8) / PO4. We compute that
* in a roundabout manner to reduce the makespan, i.e., the
* length of the dependency chain for (x % PO8) % PO4 = x % PO4.
*/
x_div_PO4 = idiv_PO4(x);
x_mod_PO4 = x - x_div_PO4 * PO4;
x_div_PO8 = idiv_PO8(x);
/*
* We actually want x_div_PO4 = (x % PO8) / PO4.
* Subtract what would have been removed by (x % PO8) from
* x_div_PO4.
*/
x_div_PO4 -= x_div_PO8 * PO4;
/*
* Finally, we can unconditionally encode_ten_thousands the
* values we obtain after division by PO8 and fixup by
* x_div_PO8 * PO4.
*/
buf = encode_ten_thousands(x_div_PO4, x_mod_PO4);
if (AN_CC_LIKELY(x < PO8)) {
unsigned zeros;
zeros = __builtin_ctzll(buf) & -8U;
buf += ZERO_CHARS;
buf >>= zeros;
memcpy(out, &buf, 8);
return out + 8 - zeros / 8;
}
/* 32 bit integers are always below 1e10. */
buf += ZERO_CHARS;
out = itoa_hundred(out, x_div_PO8);
memcpy(out, &buf, 8);
return out + 8;
}
/**
* 64 bit helpers for truncation by constant.
*/
/**
* Divide a 64 bit int by 1e4.
*/
INLINE uint64_t
ldiv_PO4(uint64_t x)
{
uint128_t wide = x;
uint128_t mul = 3777893186295716171ULL;
return (wide * mul) >> 75;
}
/**
* Divide a 64 bit int by 1e8.
*/
INLINE uint64_t
ldiv_PO8(uint64_t x)
{
uint128_t wide = x;
uint128_t mul = 12379400392853802749ULL;
return (wide * mul) >> 90;
}
/**
* Divide a 64 bit int by 1e16.
*/
INLINE uint64_t
ldiv_PO16(uint64_t x)
{
uint128_t wide = x;
uint128_t mul = 4153837486827862103ULL;
return (wide * mul) >> 115;
}
char *
an_ltoa(char *out, uint64_t x)
{
uint64_t x_div_PO4;
uint64_t x_mod_PO4;
uint64_t x_div_PO8;
uint64_t buf;
if (AN_CC_LIKELY(x < PO2)) {
return itoa_hundred(out, x);
}
if (AN_CC_LIKELY(x < PO4)) {
return itoa_ten_thousand(out, x);
}
x_div_PO4 = ldiv_PO4(x);
x_mod_PO4 = x - x_div_PO4 * PO4;
/*
* Benchmarking shows the long division by PO8 hurts
* performance for PO4 <= x < PO8. Keep encode_ten_thousands
* conditional for an_ltoa.
*/
if (AN_CC_LIKELY(x < PO8)) {
uint64_t buf;
unsigned zeros;
buf = encode_ten_thousands(x_div_PO4, x_mod_PO4);
zeros = __builtin_ctzll(buf) & -8U;
buf += ZERO_CHARS;
buf >>= zeros;
memcpy(out, &buf, 8);
return out + 8 - zeros / 8;
}
/* See block comment in an_itoa. */
x_div_PO8 = ldiv_PO8(x);
x_div_PO4 = x_div_PO4 - x_div_PO8 * PO4;
buf = encode_ten_thousands(x_div_PO4, x_mod_PO4) + ZERO_CHARS;
/*
* Add a case for PO8 <= x < PO10 because itoa_hundred is much
* quicker than a second call to encode_ten_thousands; the
* same isn't true of itoa_ten_thousand.
*/
if (AN_CC_LIKELY(x < PO10)) {
out = itoa_hundred(out, x_div_PO8);
memcpy(out, &buf, 8);
return out + 8;
}
/*
* Again, long division by PO16 hurts, so do the rest
* conditionally.
*/
if (AN_CC_LIKELY(x < PO16)) {
uint64_t buf_hi;
uint32_t hi_hi, hi_lo;
unsigned zeros;
/* x_div_PO8 < PO8 < 2**32, so idiv_PO4 is safe. */
hi_hi = idiv_PO4(x_div_PO8);
hi_lo = x_div_PO8 - hi_hi * PO4;
buf_hi = encode_ten_thousands(hi_hi, hi_lo);
zeros = __builtin_ctzll(buf_hi) & -8U;
buf_hi += ZERO_CHARS;
buf_hi >>= zeros;
memcpy(out, &buf_hi, 8);
out += 8 - zeros / 8;
memcpy(out, &buf, 8);
return out + 8;
} else {
uint64_t hi = ldiv_PO16(x);
uint64_t mid = x_div_PO8 - hi * PO8;
uint64_t buf_mid;
uint32_t mid_hi, mid_lo;
mid_hi = idiv_PO4(mid);
mid_lo = mid - mid_hi * PO4;
buf_mid = encode_ten_thousands(mid_hi, mid_lo) + ZERO_CHARS;
out = itoa_ten_thousand(out, hi);
memcpy(out, &buf_mid, 8);
memcpy(out + 8, &buf, 8);
return out + 16;
}
}