Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion cranelift/codegen/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use crate::{timing, CompileError};
use alloc::string::String;
use alloc::vec::Vec;
use cranelift_control::ControlPlane;
use target_lexicon::Architecture;

#[cfg(feature = "souper-harvest")]
use crate::souper_harvest::do_souper_harvest;
Expand Down Expand Up @@ -282,7 +283,15 @@ impl Context {

/// Perform NaN canonicalizing rewrites on the function.
///
/// Rewrites every FP arithmetic result so that any NaN it produces is
/// replaced with the canonical quiet-NaN bit pattern. The pass emits a
/// vectorized select sequence when the target has SIMD support, so we
/// probe the ISA first and tell the pass which lowering to use.
pub fn canonicalize_nans(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> {
    // Currently RiscV64 is the only arch that may lack vector support:
    // its SIMD lowering is gated behind the `has_v` ISA flag. Every other
    // backend is assumed to handle the vector select sequence.
    let has_vector_support = match isa.triple().architecture {
        Architecture::Riscv64(_) => isa
            .isa_flags()
            .iter()
            .find(|f| f.name == "has_v")
            .and_then(|f| f.as_bool())
            .unwrap_or(false),
        _ => true,
    };
    do_nan_canonicalization(&mut self.func, has_vector_support);
    // Re-run the verifier (when enabled) since we just rewrote the IR.
    self.verify_if(isa)
}

Expand Down
1 change: 1 addition & 0 deletions cranelift/codegen/src/isa/x64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1428,6 +1428,7 @@
(decl pure partial all_ones_or_all_zeros (Value) bool)
(rule (all_ones_or_all_zeros (and (icmp _ _ _) (value_type (multi_lane _ _)))) $true)
(rule (all_ones_or_all_zeros (and (fcmp _ _ _) (value_type (multi_lane _ _)))) $true)
(rule (all_ones_or_all_zeros (and (bitcast _ (fcmp _ _ _)) (value_type (multi_lane _ _)))) $true)
Copy link
Contributor Author

@adambratschikaye adambratschikaye Apr 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original pattern was never triggered when doing NaN-canonicalization because fcmp will result in either an I32X4 or I64X2 which always needs to be bitcast back to F32X4 or F64X2 before it can be passed to bitselect.

(rule (all_ones_or_all_zeros (vconst (vconst_all_ones_or_all_zeros))) $true)

(decl pure vconst_all_ones_or_all_zeros () Constant)
Expand Down
38 changes: 29 additions & 9 deletions cranelift/codegen/src/nan_canonicalization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
use crate::cursor::{Cursor, FuncCursor};
use crate::ir::condcodes::FloatCC;
use crate::ir::immediates::{Ieee32, Ieee64};
use crate::ir::types;
use crate::ir::types::{self};
use crate::ir::{Function, Inst, InstBuilder, InstructionData, Opcode, Value};
use crate::opts::MemFlags;
use crate::timing;
Expand All @@ -15,13 +15,13 @@ static CANON_32BIT_NAN: u32 = 0b01111111110000000000000000000000;
static CANON_64BIT_NAN: u64 = 0b0111111111111000000000000000000000000000000000000000000000000000;

/// Perform the NaN canonicalization pass.
pub fn do_nan_canonicalization(func: &mut Function) {
pub fn do_nan_canonicalization(func: &mut Function, has_vector_support: bool) {
let _tt = timing::canonicalize_nans();
let mut pos = FuncCursor::new(func);
while let Some(_block) = pos.next_block() {
while let Some(inst) = pos.next_inst() {
if is_fp_arith(&mut pos, inst) {
add_nan_canon_seq(&mut pos, inst);
add_nan_canon_seq(&mut pos, inst, has_vector_support);
}
}
}
Expand Down Expand Up @@ -57,24 +57,36 @@ fn is_fp_arith(pos: &mut FuncCursor, inst: Inst) -> bool {
}

/// Append a sequence of canonicalizing instructions after the given instruction.
fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst, has_vector_support: bool) {
// Select the instruction result, result type. Replace the instruction
// result and step forward before inserting the canonicalization sequence.
let val = pos.func.dfg.first_result(inst);
let val_type = pos.func.dfg.value_type(val);
let new_res = pos.func.dfg.replace_result(val, val_type);
let _next_inst = pos.next_inst().expect("block missing terminator!");

// Insert a comparison instruction, to check if `inst_res` is NaN. Select
// the canonical NaN value if `val` is NaN, assign the result to `inst`.
let is_nan = pos.ins().fcmp(FloatCC::NotEqual, new_res, new_res);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without any of the other changes, just changing this comparison from NotEqual to Unordered removes one of the two jumps which is a significant improvement. With the other changes I don't think there's a difference between using NotEqual or Unordered, but Unordered seemed more precise.

// Insert a comparison instruction, to check if `inst_res` is NaN (comparing
// against NaN is always unordered). Select the canonical NaN value if `val`
// is NaN, assign the result to `inst`.
let comparison = FloatCC::Unordered;

let vectorized_scalar_select = |pos: &mut FuncCursor, canon_nan: Value, ty: types::Type| {
let canon_nan = pos.ins().scalar_to_vector(ty, canon_nan);
let new_res = pos.ins().scalar_to_vector(ty, new_res);
let is_nan = pos.ins().fcmp(comparison, new_res, new_res);
let is_nan = pos.ins().bitcast(ty, MemFlags::new(), is_nan);
let simd_result = pos.ins().bitselect(is_nan, canon_nan, new_res);
pos.ins().with_result(val).extractlane(simd_result, 0);
};
let scalar_select = |pos: &mut FuncCursor, canon_nan: Value| {
let is_nan = pos.ins().fcmp(comparison, new_res, new_res);
pos.ins()
.with_result(val)
.select(is_nan, canon_nan, new_res);
};

let vector_select = |pos: &mut FuncCursor, canon_nan: Value| {
let is_nan = pos.ins().fcmp(comparison, new_res, new_res);
let is_nan = pos.ins().bitcast(val_type, MemFlags::new(), is_nan);
pos.ins()
.with_result(val)
Expand All @@ -84,11 +96,19 @@ fn add_nan_canon_seq(pos: &mut FuncCursor, inst: Inst) {
match val_type {
types::F32 => {
let canon_nan = pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN));
scalar_select(pos, canon_nan);
if has_vector_support {
vectorized_scalar_select(pos, canon_nan, types::F32X4);
} else {
scalar_select(pos, canon_nan);
}
}
types::F64 => {
let canon_nan = pos.ins().f64const(Ieee64::with_bits(CANON_64BIT_NAN));
scalar_select(pos, canon_nan);
if has_vector_support {
vectorized_scalar_select(pos, canon_nan, types::F64X2);
} else {
scalar_select(pos, canon_nan);
}
}
types::F32X4 => {
let canon_nan = pos.ins().f32const(Ieee32::with_bits(CANON_32BIT_NAN));
Expand Down
140 changes: 140 additions & 0 deletions cranelift/filetests/filetests/isa/x64/nan-canonicalization.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
test compile precise-output
set enable_nan_canonicalization=true
target x86_64 sse41

function %f0(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fadd v0, v1
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addps %xmm0, %xmm1, %xmm0
; movl $2143289344, %r10d
; movd %r10d, %xmm7
; shufps $0, %xmm7, const(0), %xmm7
; movdqa %xmm0, %xmm1
; cmpps $3, %xmm1, %xmm0, %xmm1
; movdqa %xmm0, %xmm2
; movdqa %xmm1, %xmm0
; movdqa %xmm2, %xmm1
; pblendvb %xmm1, %xmm7, %xmm1
; movdqa %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addps %xmm1, %xmm0
; movl $0x7fc00000, %r10d
; movd %r10d, %xmm7
; shufps $0, 0x26(%rip), %xmm7
; movdqa %xmm0, %xmm1
; cmpunordps %xmm0, %xmm1
; movdqa %xmm0, %xmm2
; movdqa %xmm1, %xmm0
; movdqa %xmm2, %xmm1
; pblendvb %xmm0, %xmm7, %xmm1
; movdqa %xmm1, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; sarb $0, (%rdi)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)

function %f1(f64, f64) -> f64 {
block0(v0: f64, v1: f64):
v2 = fadd v0, v1
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addsd %xmm0, %xmm1, %xmm0
; movabsq $9221120237041090560, %r9
; movq %r9, %xmm1
; movdqa %xmm0, %xmm7
; cmppd $3, %xmm7, %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addsd %xmm1, %xmm0
; movabsq $0x7ff8000000000000, %r9
; movq %r9, %xmm1
; movdqa %xmm0, %xmm7
; cmpunordpd %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm0, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f1(f32, f32) -> f32 {
block0(v0: f32, v1: f32):
v2 = fadd v0, v1
return v2
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; addss %xmm0, %xmm1, %xmm0
; movl $2143289344, %r9d
; movd %r9d, %xmm1
; movdqa %xmm0, %xmm7
; cmpps $3, %xmm7, %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm5, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; addss %xmm1, %xmm0
; movl $0x7fc00000, %r9d
; movd %r9d, %xmm1
; movdqa %xmm0, %xmm7
; cmpunordps %xmm0, %xmm7
; movdqa %xmm0, %xmm5
; movdqa %xmm7, %xmm0
; pblendvb %xmm0, %xmm1, %xmm5
; movdqa %xmm5, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

Loading