Skip to content

Commit

Permalink
Faster erf
Browse files Browse the repository at this point in the history
  • Loading branch information
burrbull committed Aug 2, 2022
1 parent 7505b39 commit f7285e1
Show file tree
Hide file tree
Showing 9 changed files with 680 additions and 866 deletions.
450 changes: 239 additions & 211 deletions src/common.rs

Large diffs are not rendered by default.

8 changes: 2 additions & 6 deletions src/f32.rs
Expand Up @@ -436,10 +436,6 @@ impl crate::Sleef for f32 {
}
}

impl BaseType for f32 {
type Base = Self;
}

impl MaskType for f32 {
type Mask = bool;
}
Expand All @@ -451,8 +447,8 @@ impl MulAdd for f32 {
}
}

impl Poly for f32 {
fn c2v(c: Self::Base) -> Self {
impl Poly<Self> for f32 {
fn c2v(c: Self) -> Self {
c
}
}
Expand Down
176 changes: 101 additions & 75 deletions src/f32/u10.rs
Expand Up @@ -627,89 +627,115 @@ fn test_lgammaf() {
test_f_f(lgammaf, rug::Float::ln_gamma, 0.0..=4e36, 1.0);
}

fn dfmla(x: f32, y: Doubled<f32>, z: Doubled<f32>) -> Doubled<f32> {
z + (y * x)
}
fn poly2df_b(x: f32, c1: Doubled<f32>, c0: Doubled<f32>) -> Doubled<f32> {
dfmla(x, c1, c0)
}
fn poly2df(x: f32, c1: f32, c0: Doubled<f32>) -> Doubled<f32> {
dfmla(x, df(c1, 0.), c0)
}
fn poly4df(x: f32, c3: f32, c2: Doubled<f32>, c1: Doubled<f32>, c0: Doubled<f32>) -> Doubled<f32> {
dfmla(x * x, poly2df(x, c3, c2), poly2df_b(x, c1, c0))
}

/// Error function
///
/// The error bound of the returned value is 1.0 ULP.
pub fn erff(mut a: f32) -> f32 {
let s = a;

a = fabsfk(a);
let o0 = a < 1.1;
let o1 = a < 2.4;
let o2 = a < 4.0;
let mut u = if o0 { a * a } else { a };

let t = if o0 {
0.708_929_219_4_e-4_f32
} else if o1 {
-0.179_266_789_9_e-4
pub fn erff(a: f32) -> f32 {
let x = fabsfk(a);
let x2 = x * x;
let x4 = x2 * x2;

let mut t2;
if x < 2.5 {
// Abramowitz and Stegun
let t = f32::poly6(
x,
x2,
x4,
-0.436_044_700_8_e-6,
0.686_751_536_7_e-5,
-0.304_515_670_0_e-4,
0.980_853_656_1_e-4,
0.239_552_391_6_e-3,
0.145_990_154_1_e-3,
);
t2 = poly4df(
x,
t,
df(
0.009_288_344_532_251_358_032_2,
-2.786_374_589_702_533_075_5_e-11,
),
df(
0.042_275_499_552_488_327_026,
1.346_139_928_998_810_605_7_e-09,
),
df(
0.070_523_701_608_180_999_756,
-3.661_630_931_870_736_516_3_e-09,
),
);
t2 = (1.).add_checked(t2 * x);
t2 = t2.square();
t2 = t2.square();
t2 = t2.square();
t2 = t2.square();
t2 = t2.recpre();
} else if x > 4. {
t2 = df(0., 0.);
} else {
-0.949_575_769_5_e-5
let t = f32::poly6(
x,
x2,
x4,
-0.113_001_284_8_e-6,
0.411_527_298_6_e-5,
-0.692_830_435_6_e-4,
0.717_269_256_7_e-3,
-0.513_104_535_6_e-2,
0.270_863_715_6_e-1,
);
t2 = poly4df(
x,
t,
df(
-0.110_643_193_125_724_792_48,
3.705_045_277_722_528_300_7_e-09,
),
df(
-0.631_922_304_630_279_541_02,
-2.020_043_258_507_317_785_9_e-08,
),
df(
-1.129_663_825_035_095_214_8,
2.551_512_019_645_325_925_2_e-08,
),
);
t2 *= x;
t2 = df(expkf(t2), 0.);
}
.mul_add(
u,
if o0 {
-0.776_831_118_9_e-3
} else if o1 {
0.393_763_301_e-3
} else {
0.248_146_592_6_e-3
},
)
.mul_add(
u,
if o0 {
0.515_946_373_3_e-2
} else if o1 {
-0.394_918_117_7_e-2
} else {
-0.291_817_681_9_e-2
},
)
.mul_add(
u,
if o0 {
-0.268_378_127_4_e-1
} else if o1 {
0.244_547_464_e-1

t2 += -1.;

if x < 1e-4 {
t2 = df(
-1.128_379_225_730_895_996_1,
5.863_538_342_219_759_109_7_e-08,
) * x;
}
mulsignf(
if a == 0. {
0.
} else if a.is_infinite() {
1.
} else {
0.205_970_667_3_e-1
-t2.0 - t2.1
},
a,
)
.mul_add(
u,
if o0 {
0.112_831_801_2
} else if o1 {
-0.107_099_615_0
} else {
-0.990_189_984_4_e-1
},
);
let mut d = t.mul_as_doubled(u);
d += if o0 {
df(-0.376_125_876_000_657_465_175_213_237_214, 0.)
} else if o1 {
df(-0.634_588_905_908_410_389_971_210_809_210, 0.)
} else {
df(-0.643_598_050_547_891_613_081_201_721_633, 0.)
};
d *= u;
d += if o0 {
df(0.112_837_916_021_059_138_255_978_217_023_e+1, 0.)
} else if o1 {
df(-0.112_879_855_826_694_507_209_862_753_992_e+1, 0.)
} else {
df(-0.112_461_487_742_845_562_801_052_956_293_e+1, 0.)
};
d *= a;
d = if o0 { d } else { (1.).add_checked(-expk2f(d)) };
u = mulsignf(if o2 { d.0 + d.1 } else { 1. }, s);
if a.is_nan() {
f32::NAN
} else {
u
}
}

#[test]
Expand Down
26 changes: 8 additions & 18 deletions src/f32x.rs
Expand Up @@ -23,22 +23,6 @@ macro_rules! impl_math_f32 {
type I32x = packed_simd::Simd<[i32; $size]>;
type M32x = packed_simd::Simd<[packed_simd::m32; $size]>;

impl BaseType for F32x {
type Base = f32;
}

impl BaseType for U32x {
type Base = u32;
}

impl BaseType for I32x {
type Base = i32;
}
/*
impl BaseType for M32x {
type Base = m32;
}
*/
impl MaskType for F32x {
type Mask = M32x;
}
Expand Down Expand Up @@ -598,12 +582,18 @@ macro_rules! impl_math_f32 {
}
}

impl Poly for F32x {
fn c2v(c: Self::Base) -> Self {
impl Poly<f32> for F32x {
fn c2v(c: f32) -> Self {
F32x::splat(c)
}
}

impl Poly<Self> for F32x {
fn c2v(c: Self) -> Self {
c
}
}

#[inline]
fn vsel_vi2_vf_vf_vi2_vi2(f0: F32x, f1: F32x, x: I32x, y: I32x) -> I32x {
f0.lt(f1).select(x, y)
Expand Down

0 comments on commit f7285e1

Please sign in to comment.