Added new gamma functions for x86
awxkee committed Jun 16, 2024
1 parent 1b3b686 commit 583a9fd
Showing 7 changed files with 75 additions and 61 deletions.
4 changes: 2 additions & 2 deletions src/app/src/main.rs
@@ -65,7 +65,7 @@ fn main() {
store_stride as u32,
width,
height,
TransferFunction::Srgb,
TransferFunction::Gamma2p8,
);
let elapsed_time = start_time.elapsed();
// Print the elapsed time in milliseconds
@@ -100,7 +100,7 @@ fn main() {
src_stride,
width,
height,
TransferFunction::Srgb
TransferFunction::Gamma2p8
);

let elapsed_time = start_time.elapsed();
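
The benchmark driver now exercises the new pure-gamma curve instead of sRGB. Judging by the match arms added throughout this commit, TransferFunction now carries four variants; a rough sketch of its shape, inferred from those match arms rather than taken from this diff (the real definition lives elsewhere in the crate):

// Hypothetical shape, inferred from the match arms in this commit.
#[derive(Copy, Clone, Debug)]
pub enum TransferFunction {
    Srgb,     // piecewise sRGB curve
    Rec709,   // piecewise Rec.709 curve
    Gamma2p2, // pure power law with exponent 2.2
    Gamma2p8, // pure power law with exponent 2.8
}
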
32 changes: 7 additions & 25 deletions src/avx/avx2_to_xyz_lab.rs
@@ -1,32 +1,19 @@
use crate::avx::avx_gamma_curves::{avx2_rec709_to_linear, avx2_srgb_to_linear};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::avx::*;
use crate::avx::avx_gamma_curves::get_avx2_linear_transfer;
#[allow(unused_imports)]
use crate::gamma_curves::TransferFunction;
#[allow(unused_imports)]
use crate::image::ImageConfiguration;
#[allow(unused_imports)]
use crate::image_to_xyz_lab::XyzTarget;
use crate::luv::{LUV_CUTOFF_FORWARD_Y, LUV_MULTIPLIER_FORWARD_Y};
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn get_avx2_linear_transfer(
transfer_function: TransferFunction,
) -> unsafe fn(__m256) -> __m256 {
match transfer_function {
TransferFunction::Srgb => avx2_srgb_to_linear,
TransferFunction::Rec709 => avx2_rec709_to_linear,
}
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
unsafe fn avx2_triple_to_xyz(
r: __m256i,
g: __m256i,
@@ -56,7 +43,6 @@ unsafe fn avx2_triple_to_xyz(
(x, y, z)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub(crate) unsafe fn avx2_triple_to_luv(
x: __m256,
@@ -87,9 +73,7 @@ pub(crate) unsafe fn avx2_triple_to_luv(
(l, u, v)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
unsafe fn avx2_triple_to_lab(x: __m256, y: __m256, z: __m256) -> (__m256, __m256, __m256) {
let x = _mm256_mul_ps(x, _mm256_set1_ps(100f32 / 95.047f32));
let y = _mm256_mul_ps(y, _mm256_set1_ps(100f32 / 100f32));
@@ -112,10 +96,8 @@ unsafe fn avx2_triple_to_lab(x: __m256, y: __m256, z: __m256
(l, a, b)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub(crate) unsafe fn avx2_channels_to_xyz_or_lab<
pub unsafe fn avx2_image_to_xyz_lab<
const CHANNELS_CONFIGURATION: u8,
const USE_ALPHA: bool,
const TARGET: u8,
47 changes: 38 additions & 9 deletions src/avx/avx_gamma_curves.rs
@@ -6,7 +6,6 @@ use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(0.0030412825601275209f32);
@@ -26,7 +25,6 @@ pub unsafe fn avx2_srgb_from_linear(linear: __m256) -> __m256 {
return _mm256_select_ps(mask, high, low);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(12.92f32 * 0.0030412825601275209f32);
@@ -44,7 +42,6 @@ pub unsafe fn avx2_srgb_to_linear(gamma: __m256) -> __m256 {
return _mm256_select_ps(mask, high, low);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(0.018053968510807f32);
@@ -64,16 +61,15 @@ pub unsafe fn avx2_rec709_from_linear(linear: __m256) -> __m256 {
return _mm256_select_ps(mask, high, low);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_rec709_to_linear(linear: __m256) -> __m256 {
pub unsafe fn avx2_rec709_to_linear(gamma: __m256) -> __m256 {
let low_cut_off = _mm256_set1_ps(4.5f32 * 0.018053968510807f32);
let mask = _mm256_cmp_ps::<_CMP_GE_OS>(linear, low_cut_off);
let mask = _mm256_cmp_ps::<_CMP_GE_OS>(gamma, low_cut_off);

let mut low = linear;
let mut low = gamma;
let high = _mm256_pow_n_ps(
_mm256_mul_ps(
_mm256_add_ps(linear, _mm256_set1_ps(0.09929682680944f32)),
_mm256_add_ps(gamma, _mm256_set1_ps(0.09929682680944f32)),
_mm256_set1_ps(1f32 / 1.09929682680944f32),
),
1.0f32 / 0.45f32,
@@ -82,13 +78,46 @@ pub unsafe fn avx2_rec709_to_linear(linear: __m256) -> __m256 {
return _mm256_select_ps(mask, high, low);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn avx2_gamma2p2_to_linear(gamma: __m256) -> __m256 {
_mm256_pow_n_ps(gamma, 2.2f32)
}

#[inline(always)]
pub unsafe fn avx2_gamma2p8_to_linear(gamma: __m256) -> __m256 {
_mm256_pow_n_ps(gamma, 2.8f32)
}

#[inline(always)]
pub unsafe fn avx2_gamma2p2_from_linear(linear: __m256) -> __m256 {
_mm256_pow_n_ps(linear, 1f32 / 2.2f32)
}

#[inline(always)]
pub unsafe fn avx2_gamma2p8_from_linear(linear: __m256) -> __m256 {
_mm256_pow_n_ps(linear, 1f32 / 2.8f32)
}

#[inline(always)]
pub unsafe fn get_avx_gamma_transfer(
transfer_function: TransferFunction,
) -> unsafe fn(__m256) -> __m256 {
match transfer_function {
TransferFunction::Srgb => avx2_srgb_from_linear,
TransferFunction::Rec709 => avx2_rec709_from_linear,
TransferFunction::Gamma2p2 => avx2_gamma2p2_from_linear,
TransferFunction::Gamma2p8 => avx2_gamma2p8_from_linear,
}
}

#[inline(always)]
pub unsafe fn get_avx2_linear_transfer(
transfer_function: TransferFunction,
) -> unsafe fn(__m256) -> __m256 {
match transfer_function {
TransferFunction::Srgb => avx2_srgb_to_linear,
TransferFunction::Rec709 => avx2_rec709_to_linear,
TransferFunction::Gamma2p2 => avx2_gamma2p2_to_linear,
TransferFunction::Gamma2p8 => avx2_gamma2p8_to_linear,
}
}
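
The new curves are plain power laws, so the AVX2 routines above amount to a vectorized powf per lane. A minimal scalar sketch of what each lane computes (illustrative only; these helper names are not part of the crate):

// Scalar reference for avx2_gamma2p2/2p8_to_linear and _from_linear.
fn pure_gamma_to_linear(gamma: f32, power: f32) -> f32 {
    gamma.powf(power) // decode: power = 2.2 or 2.8
}

fn pure_gamma_from_linear(linear: f32, power: f32) -> f32 {
    linear.powf(1.0 / power) // encode: inverse of the decode step
}

Round-tripping a value through the pair should return roughly the input, e.g. pure_gamma_from_linear(pure_gamma_to_linear(0.5, 2.8), 2.8) ≈ 0.5, up to floating-point error.
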
8 changes: 0 additions & 8 deletions src/avx/avx_math.rs
@@ -23,7 +23,6 @@ pub unsafe fn _mm256_prefer_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
return _mm256_fmadd_ps(b, c, a);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
unsafe fn _mm256_taylorpoly_ps(
x: __m256,
@@ -50,7 +49,6 @@ unsafe fn _mm256_taylorpoly_ps(
return res;
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 {
let const_ln127 = _mm256_set1_epi32(127); // 127
@@ -79,9 +77,7 @@ pub unsafe fn _mm256_log_ps(v: __m256) -> __m256 {
poly
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_select_ps(mask: __m256, true_vals: __m256, false_vals: __m256) -> __m256 {
_mm256_blendv_ps(false_vals, true_vals, mask)
}
@@ -107,9 +103,7 @@ pub unsafe fn _mm256_select_si256(
)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm256_exp_ps(x: __m256) -> __m256 {
let c1 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
let c2 = _mm256_castsi256_ps(_mm256_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
@@ -271,9 +265,7 @@ pub(crate) unsafe fn _mm256_neg_epi32(x: __m256i) -> __m256i {
return _mm256_sub_epi32(high, x);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
/// Cube root computed via the pow function.
/// It is still precise; however, due to the inexact nature of the 1/3 power, the result differs slightly
/// from a true cbrt (by about 3-4 ULP), but it is almost 2 times faster than a cbrt with a true ULP of 3.5.
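
The per-function #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] attributes removed in this file are redundant if the whole avx module is only compiled on x86 targets in the first place. A sketch of that kind of module-level gate (an assumption about the crate layout, not shown in this diff):

// e.g. in src/lib.rs — gate the SIMD modules once instead of every item inside them
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod avx;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod sse;
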
6 changes: 3 additions & 3 deletions src/image_to_xyz_lab.rs
@@ -2,7 +2,7 @@
any(target_arch = "x86_64", target_arch = "x86"),
target_feature = "avx2"
))]
use crate::avx::avx2_channels_to_xyz_or_lab;
use crate::avx::avx2_image_to_xyz_lab;
use crate::gamma_curves::TransferFunction;
use crate::image::ImageConfiguration;
use crate::image_to_xyz_lab::XyzTarget::{LAB, LUV, XYZ};
@@ -103,7 +103,7 @@ fn channels_to_xyz<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, cons
unsafe {
if _has_avx2 {
if USE_ALPHA {
cx = avx2_channels_to_xyz_or_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
cx = avx2_image_to_xyz_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
cx,
src.as_ptr(),
src_offset,
@@ -116,7 +116,7 @@ fn channels_to_xyz<const CHANNELS_CONFIGURATION: u8, const USE_ALPHA: bool, cons
transfer_function,
);
} else {
cx = avx2_channels_to_xyz_or_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
cx = avx2_image_to_xyz_lab::<CHANNELS_CONFIGURATION, USE_ALPHA, TARGET>(
cx,
src.as_ptr(),
src_offset,
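
The _has_avx2 flag that gates these calls is defined outside this hunk; on stable Rust it would typically come from runtime feature detection along these lines (a sketch, not taken from this diff):

// Sketch: check at runtime that the CPU actually supports AVX2 before taking the SIMD path.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn has_avx2() -> bool {
    is_x86_feature_detected!("avx2")
}

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
fn has_avx2() -> bool {
    false
}
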
25 changes: 25 additions & 0 deletions src/sse/gamma_curves.rs
@@ -77,12 +77,35 @@ pub unsafe fn sse_rec709_to_linear(linear: __m128) -> __m128 {
return _mm_select_ps(mask, high, low);
}

#[inline(always)]
pub unsafe fn sse_gamma2p2_to_linear(gamma: __m128) -> __m128 {
_mm_pow_n_ps(gamma, 2.2f32)
}

#[inline(always)]
pub unsafe fn sse_gamma2p8_to_linear(gamma: __m128) -> __m128 {
_mm_pow_n_ps(gamma, 2.8f32)
}

#[inline(always)]
pub unsafe fn sse_gamma2p2_from_linear(linear: __m128) -> __m128 {
_mm_pow_n_ps(linear, 1f32 / 2.2f32)
}

#[inline(always)]
pub unsafe fn sse_gamma2p8_from_linear(linear: __m128) -> __m128 {
_mm_pow_n_ps(linear, 1f32 / 2.8f32)
}

#[inline(always)]
pub unsafe fn get_sse_linear_transfer(
transfer_function: TransferFunction,
) -> unsafe fn(__m128) -> __m128 {
match transfer_function {
TransferFunction::Srgb => sse_srgb_to_linear,
TransferFunction::Rec709 => sse_rec709_to_linear,
TransferFunction::Gamma2p2 => sse_gamma2p2_to_linear,
TransferFunction::Gamma2p8 => sse_gamma2p8_to_linear,
}
}

@@ -93,5 +116,7 @@ pub unsafe fn get_sse_gamma_transfer(
match transfer_function {
TransferFunction::Srgb => sse_srgb_from_linear,
TransferFunction::Rec709 => sse_rec709_from_linear,
TransferFunction::Gamma2p2 => sse_gamma2p2_from_linear,
TransferFunction::Gamma2p8 => sse_gamma2p8_from_linear,
}
}
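
A typical caller resolves the transfer curve once per image and then applies the returned function pointer lane by lane. A rough sketch against the SSE helpers above (assumes it lives in the same module so get_sse_linear_transfer and TransferFunction are in scope, and that the caller has already verified SSE4.1 support):

#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

// Sketch: linearize an encoded f32 buffer in place, four values at a time.
unsafe fn linearize_in_place(values: &mut [f32], transfer_function: TransferFunction) {
    let to_linear = get_sse_linear_transfer(transfer_function); // resolved once, outside the loop
    for chunk in values.chunks_exact_mut(4) {
        let v = _mm_loadu_ps(chunk.as_ptr());
        _mm_storeu_ps(chunk.as_mut_ptr(), to_linear(v));
    }
    // any remainder (fewer than 4 values) would be handled by a scalar fallback
}
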
14 changes: 0 additions & 14 deletions src/sse/math.rs
@@ -9,25 +9,19 @@ pub unsafe fn _mm_cube_ps(x: __m128) -> __m128 {
_mm_mul_ps(_mm_mul_ps(x, x), x)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(not(target_feature = "fma"))]
#[inline]
#[allow(dead_code)]
pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
return _mm_add_ps(_mm_mul_ps(b, c), a);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[cfg(target_feature = "fma")]
#[inline]
#[allow(dead_code)]
pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 {
return _mm_fmadd_ps(b, c, a);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline]
#[allow(dead_code)]
unsafe fn _mm_taylorpoly_ps(
x: __m128,
poly0: __m128,
@@ -49,9 +43,7 @@ unsafe fn _mm_taylorpoly_ps(
return res;
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm_log_ps(v: __m128) -> __m128 {
let const_ln127 = _mm_set1_epi32(127); // 127
let const_ln2 = _mm_set1_ps(std::f32::consts::LN_2); // ln(2)
@@ -76,9 +68,7 @@ pub unsafe fn _mm_log_ps(v: __m128) -> __m128 {
poly
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm_select_ps(mask: __m128, true_vals: __m128, false_vals: __m128) -> __m128 {
_mm_blendv_ps(false_vals, true_vals, mask)
}
@@ -100,9 +90,7 @@ pub unsafe fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __
)
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn _mm_exp_ps(x: __m128) -> __m128 {
let c1 = _mm_castsi128_ps(_mm_set1_epi32(0x3f7ffff6)); // x^1: 0x1.ffffecp-1f
let c2 = _mm_castsi128_ps(_mm_set1_epi32(0x3efffedb)); // x^2: 0x1.fffdb6p-2f
@@ -259,9 +247,7 @@ pub(crate) unsafe fn _mm_neg_ps(x: __m128) -> __m128 {
return _mm_sub_ps(high, x);
}

#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
#[inline(always)]
#[allow(dead_code)]
/// Cube root computed via the pow function.
/// It is still precise; however, due to the inexact nature of the 1/3 power, the result differs slightly
/// from a true cbrt (by about 3-4 ULP), but it is almost 2 times faster than a cbrt with a true ULP of 3.5.
