Added new gamma functions for x86
awxkee committed Jun 16, 2024
1 parent 1b3b686 commit 583a9fd
Showing 7 changed files with 75 additions and 61 deletions.
4 changes: 2 additions & 2 deletions src/app/src/main.rs
@@ -65,7 +65,7 @@ fn main() {
store_stride as u32,
width,
height,
TransferFunction::Srgb,
TransferFunction::Gamma2p8,
);
let elapsed_time = start_time.elapsed();
// Print the elapsed time in milliseconds
@@ -100,7 +100,7 @@ fn main() {
src_stride,
width,
height,
TransferFunction::Srgb
TransferFunction::Gamma2p8
);

let elapsed_time = start_time.elapsed();
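
The benchmark driver now exercises the new pure-gamma curve instead of sRGB. Judging by the match arms added throughout this commit, TransferFunction now carries four variants; a rough sketch of its shape, inferred from those match arms rather than taken from this diff (the real definition lives elsewhere in the crate):

// Hypothetical shape, inferred from the match arms in this commit.
#[derive(Copy, Clone, Debug)]
pub enum TransferFunction {
    Srgb,     // piecewise sRGB curve
    Rec709,   // piecewise Rec.709 curve
    Gamma2p2, // pure power law with exponent 2.2
    Gamma2p8, // pure power law with exponent 2.8
}
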
32 changes: 7 additions & 25 deletions src/avx/avx2_to_xyz_lab.rs
@@ -1,32 +1,19 @@
use crate::avx::avx_gamma_curves::{avx2_rec709_to_linear, avx2_srgb_to_linear};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::avx::*;
use crate::avx::avx_gamma_curves::get_avx2_linear_transfer;
#[allow(unused_imports)]
use crate::gamma_curves::TransferFunction;
#[allow(unused_imports)]
use crate::image::ImageConfiguration;
#[allow(unused_imports)]
use crate::image_to_xyz_lab::XyzTarget;
use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn get_avx2_linear_transfer(
transfer_function: TransferFunction,
) -> unsafe fn(__m256) -> __m256 {
match transfer_function {
TransferFunction::Srgb => avx2_srgb_to_linear,
TransferFunction::Rec709 => avx2_rec709_to_linear,
}
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
unsafe fn avx2_triple_to_xyz(
r: __m256i,
g: __m256i,
@@ -56,7 +43,6 @@ unsafe fn avx2_triple_to_xyz(
(x, y, z)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub(crate) unsafe fn avx2_triple_to_luv(
x: __m256,
@@ -87,9 +73,7 @@ pub(crate) unsafe fn avx2_triple_to_luv(
(l, u, v)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
unsafe fn avx2_triple_to_lab(x: __m256, y: __m256, z: __m256) -> (__m256, __m256, __m256) {
let x = _mm256_mul_ps(x, _mm256_set1_ps(100f32 / 95.047f32));
let y = _mm256_mul_ps(y, _mm256_set1_ps(100f32 / 100f32));
@@ -112,10 +96,8 @@ unsafe fn avx2_triple_to_lab(x: __m256, y: __m256, z: __m256
(l, a, b)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub(crate) unsafe fn avx2_channels_to_xyz_or_lab<
pub unsafe fn avx2_image_to_xyz_lab<
const CHANNELS_CONFIGURATION: u8,
const USE_ALPHA: bool,
const TARGET: u8,
47 changes: 38 additions & 9 deletions src/avx/avx_gamma_curves.rs
@@ -6,7 +6,6 @@ use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(0.0030412825601275209f32);
@@ -26,7 +25,6 @@ pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 {
return _mm256_select_ps(mask, high, low);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(12.92f32 * 0.0030412825601275209f32);
@@ -44,7 +42,6 @@ pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 {
return _mm256_select_ps(mask, high, low);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(0.018053968510807f32);
@@ -64,16 +61,15 @@ pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 {
return _mm256_select_ps(mask, high, low);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_rec709_to_linear(linear: __m256) -> __m256 {
pub unsafe fn avx2_rec709_to_linear(gamma: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(4.5f32 * 0.018053968510807f32);
let mask = _mm256_cmp_ps::<_CMP_GE_OS>(linear, low_cut_off);
let mask = _mm256_cmp_ps::<_CMP_GE_OS>(gamma, low_cut_off);

let mut low = linear;
let mut low = gamma;
let high = _mm256_pow_n_ps(
_mm256_mul_ps(
_mm256_add_ps(linear, _mm256_set1_ps(0.09929682680944f32)),
_mm256_add_ps(gamma, _mm256_set1_ps(0.09929682680944f32)),
_mm256_set1_ps(1f32 / 1.09929682680944f32),
),
1.0f32 / 0.45f32,
@@ -82,13 +78,46 @@ pub unsafe fn avx2_rec709_to_linear(linear: __m256) -> __m256 {
return _mm256_select_ps(mask, high, low);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_gamma2p2_to_linear(gamma: __m256) -> __m256 {
_mm256_pow_n_ps(gamma, 2.2f32)
}

#[inline(always)]
pub unsafe fn avx2_gamma2p8_to_linear(gamma: __m256) -> __m256 {
_mm256_pow_n_ps(gamma, 2.8f32)
}

#[inline(always)]
pub unsafe fn avx2_gamma2p2_from_linear(linear: __m256) -> __m256 {
_mm256_pow_n_ps(linear, 1f32 / 2.2f32)
}

#[inline(always)]
pub unsafe fn avx2_gamma2p8_from_linear(linear: __m256) -> __m256 {
_mm256_pow_n_ps(linear, 1f32 / 2.8f32)
}

#[inline(always)]
pub unsafe fn get_avx_gamma_transfer(
transfer_function: TransferFunction,
) -> unsafe fn(__m256) -> __m256 {
match transfer_function {
TransferFunction::Srgb => avx2_srgb_from_linear,
TransferFunction::Rec709 => avx2_rec709_from_linear,
TransferFunction::Gamma2p2 => avx2_gamma2p2_from_linear,
TransferFunction::Gamma2p8 => avx2_gamma2p8_from_linear,
}
}

#[inline(always)]
pub unsafe fn get_avx2_linear_transfer(
transfer_function: TransferFunction,
) -> unsafe fn(__m256) -> __m256 {
match transfer_function {
TransferFunction::Srgb => avx2_srgb_to_linear,
TransferFunction::Rec709 => avx2_rec709_to_linear,
TransferFunction::Gamma2p2 => avx2_gamma2p2_to_linear,
TransferFunction::Gamma2p8 => avx2_gamma2p8_to_linear,
}
}
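
The new curves are plain power laws, so the AVX2 routines above amount to a vectorized powf per lane. A minimal scalar sketch of what each lane computes (illustrative only; these helper names are not part of the crate):

// Scalar reference for avx2_gamma2p2/2p8_to_linear and _from_linear.
fn pure_gamma_to_linear(gamma: f32, power: f32) -> f32 {
    gamma.powf(power) // decode: power = 2.2 or 2.8
}

fn pure_gamma_from_linear(linear: f32, power: f32) -> f32 {
    linear.powf(1.0 / power) // encode: inverse of the decode step
}

Round-tripping a value through the pair should return roughly the input, e.g. pure_gamma_from_linear(pure_gamma_to_linear(0.5, 2.8), 2.8) ≈ 0.5, up to floating-point error.
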
8 changes: 0 additions & 8 deletions src/avx/avx_math.rs
@@ -23,7 +23,6 @@ pub unsafe fn _mm256_prefer_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
return _mm256_fmadd_ps(b, c, a);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
unsafe fn _mm256_taylorpoly_ps(
x: __m256,
@@ -50,7 +49,6 @@ unsafe fn _mm256_taylorpoly_ps(
return res;
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 {
let const_ln127 = _mm256_set1_epi32(127); // 127
@@ -79,9 +77,7 @@ pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 {
poly
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_select_ps(mask: __m256, true_vals: __m256, false_vals: __m256) -> __m256 {
_mm256_blendv_ps(false_vals, true_vals, mask)
}
@@ -107,9 +103,7 @@ pub unsafe fn _mm256_select_si256(
)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_exp_ps(x: __m256) -> __m256 {
let c1 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
let c2 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
@@ -271,9 +265,7 @@ pub(crate) unsafe fn _mm256_neg_epi32(x: __m256i) -> __m256i {
return _mm256_sub_epi32(high, x);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
/// Cube root computed via the pow function.
/// It is still precise; however, due to the inexact nature of the 1/3 power, the result differs slightly
/// from a true cbrt (by about 3-4 ULP), but it is almost 2 times faster than a cbrt with a true ULP of 3.5.
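
The per-function #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] attributes removed in this file are redundant if the whole avx module is only compiled on x86 targets in the first place. A sketch of that kind of module-level gate (an assumption about the crate layout, not shown in this diff):

// e.g. in src/lib.rs — gate the SIMD modules once instead of every item inside them
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod avx;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod sse;
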
6 changes: 3 additions & 3 deletions src/image_to_xyz_lab.rs
@@ -2,7 +2,7 @@
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "avx2"
))]
use crate::avx::avx2_channels_to_xyz_or_lab;
use crate::avx::avx2_image_to_xyz_lab;
use crate::gamma_curves::TransferFunction;
use crate::image::ImageConfiguration;
use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ};
@@ -103,7 +103,7 @@ fn channels_to_xyz<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, cons
unsafe {
if _has_avx2 {
if USE_ALPHA {
cx = avx2_channels_to_xyz_or_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
cx = avx2_image_to_xyz_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
cx,
src.as_ptr(),
src_offset,
@@ -116,7 +116,7 @@ fn channels_to_xyz<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, cons
transfer_function,
);
} else {
cx = avx2_channels_to_xyz_or_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
cx = avx2_image_to_xyz_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
cx,
src.as_ptr(),
src_offset,
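
The _has_avx2 flag that gates these calls is defined outside this hunk; on stable Rust it would typically come from runtime feature detection along these lines (a sketch, not taken from this diff):

// Sketch: check at runtime that the CPU actually supports AVX2 before taking the SIMD path.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn has_avx2() -> bool {
    is_x86_feature_detected!("avx2")
}

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
fn has_avx2() -> bool {
    false
}
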
25 changes: 25 additions & 0 deletions src/sse/gamma_curves.rs
@@ -77,12 +77,35 @@ pub unsafe fn sse_rec709_to_linear(linear: __m128) -> __m128 {
return _mm_select_ps(mask, high, low);
}

#[inline(always)]
pub unsafe fn sse_gamma2p2_to_linear(gamma: __m128) -> __m128 {
_mm_pow_n_ps(gamma, 2.2f32)
}

#[inline(always)]
pub unsafe fn sse_gamma2p8_to_linear(gamma: __m128) -> __m128 {
_mm_pow_n_ps(gamma, 2.8f32)
}

#[inline(always)]
pub unsafe fn sse_gamma2p2_from_linear(linear: __m128) -> __m128 {
_mm_pow_n_ps(linear, 1f32 / 2.2f32)
}

#[inline(always)]
pub unsafe fn sse_gamma2p8_from_linear(linear: __m128) -> __m128 {
_mm_pow_n_ps(linear, 1f32 / 2.8f32)
}

#[inline(always)]
pub unsafe fn get_sse_linear_transfer(
transfer_function: TransferFunction,
) -> unsafe fn(__m128) -> __m128 {
match transfer_function {
TransferFunction::Srgb => sse_srgb_to_linear,
TransferFunction::Rec709 => sse_rec709_to_linear,
TransferFunction::Gamma2p2 => sse_gamma2p2_to_linear,
TransferFunction::Gamma2p8 => sse_gamma2p8_to_linear,
}
}

@@ -93,5 +116,7 @@ pub unsafe fn get_sse_gamma_transfer(
match transfer_function {
TransferFunction::Srgb => sse_srgb_from_linear,
TransferFunction::Rec709 => sse_rec709_from_linear,
TransferFunction::Gamma2p2 => sse_gamma2p2_from_linear,
TransferFunction::Gamma2p8 => sse_gamma2p8_from_linear,
}
}
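
A typical caller resolves the transfer curve once per image and then applies the returned function pointer lane by lane. A rough sketch against the SSE helpers above (assumes it lives in the same module so get_sse_linear_transfer and TransferFunction are in scope, and that the caller has already verified SSE4.1 support):

#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

// Sketch: linearize an encoded f32 buffer in place, four values at a time.
unsafe fn linearize_in_place(values: &mut [f32], transfer_function: TransferFunction) {
    let to_linear = get_sse_linear_transfer(transfer_function); // resolved once, outside the loop
    for chunk in values.chunks_exact_mut(4) {
        let v = _mm_loadu_ps(chunk.as_ptr());
        _mm_storeu_ps(chunk.as_mut_ptr(), to_linear(v));
    }
    // any remainder (fewer than 4 values) would be handled by a scalar fallback
}
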
14 changes: 0 additions & 14 deletions src/sse/math.rs
@@ -9,25 +9,19 @@ pub unsafe fn _mm_cube_ps(x: __m128) -> __m128 {
_mm_mul_ps(_mm_mul_ps(x, x), x)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(not(target_feature = "fma"))]
#[inline]
#[allow(dead_code)]
pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
return _mm_add_ps(_mm_mul_ps(b, c), a);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(target_feature = "fma")]
#[inline]
#[allow(dead_code)]
pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
return _mm_fmadd_ps(b, c, a);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline]
#[allow(dead_code)]
unsafe fn _mm_taylorpoly_ps(
x: __m128,
poly0: __m128,
@@ -49,9 +43,7 @@ unsafe fn _mm_taylorpoly_ps(
return res;
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm_log_ps(v: __m128) -> __m128 {
let const_ln127 = _mm_set1_epi32(127); // 127
let const_ln2 = _mm_set1_ps(std::f32::consts::LN_2); // ln(2)
@@ -76,9 +68,7 @@ pub unsafe fn _mm_log_ps(v: __m128) -> __m128 {
poly
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm_select_ps(mask: __m128, true_vals: __m128, false_vals: __m128) -> __m128 {
_mm_blendv_ps(false_vals, true_vals, mask)
}
@@ -100,9 +90,7 @@ pub unsafe fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __
)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm_exp_ps(x: __m128) -> __m128 {
let c1 = _mm_castsi128_ps(_mm_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
let c2 = _mm_castsi128_ps(_mm_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
@@ -259,9 +247,7 @@ pub(crate) unsafe fn _mm_neg_ps(x: __m128) -> __m128 {
return _mm_sub_ps(high, x);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
/// Cube root computed via the pow function.
/// It is still precise; however, due to the inexact nature of the 1/3 power, the result differs slightly
/// from a true cbrt (by about 3-4 ULP), but it is almost 2 times faster than a cbrt with a true ULP of 3.5.
