Port fcmp to ISLE (AArch64) (#4819)

bytecodealliance · Aug 30, 2022 · 3d9d759 · 3d9d759
1 parent e910b8f
commit 3d9d759
Show file tree

Hide file tree

Showing 9 changed files with 171 additions and 51 deletions.
diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -2638,6 +2638,11 @@
 ;; TODO: Port lower_splat_const() to ISLE.
 (extern constructor splat_const splat_const)
 
+;; Lower a `FloatCC` to a `Cond`; backed by the Rust `lower_fp_condcode()`.
+(decl fp_cond_code (FloatCC) Cond)
+;; TODO: Port lower_fp_condcode() to ISLE.
+(extern constructor fp_cond_code fp_cond_code)
+
 ;; Generate comparison to zero operator from input condition code
 (decl float_cc_cmp_zero_to_vec_misc_op (FloatCC) VecMisc2)
 (extern constructor float_cc_cmp_zero_to_vec_misc_op float_cc_cmp_zero_to_vec_misc_op)
@@ -2966,3 +2971,100 @@
       (let ((dst WritableReg (temp_writable_reg $I64))
            (_ Unit (emit (abi_stackslot_addr dst stack_slot offset))))
         dst))
+
+;; Helpers for emitting instruction sequences to perform a vector comparison.
+;; `vec_cmp_vc` ANDs the `fcmeq` self-comparison masks of `rn` and `rm`
+;; (NOTE(review): presumably the lanes where neither input is NaN -- confirm).
+(rule (vec_cmp_vc rn rm size)
+      (let ((dst Reg (vec_rrr (VecALUOp.Fcmeq) rn rn size))
+            (tmp Reg (vec_rrr (VecALUOp.Fcmeq) rm rm size))
+            (dst Reg (vec_rrr (VecALUOp.And) dst tmp size)))
+       dst))
+
+(decl vec_cmp (Reg Reg Type Cond) Reg)
+;; Rules are selected by the lane kind of `ty` and by the requested `Cond`.
+;; Floating-point Vc is `vec_cmp_vc` itself; Vs is that mask inverted.
+(rule (vec_cmp rn rm ty (Cond.Vc))
+      (if (ty_vector_float ty))
+      (vec_cmp_vc rn rm (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Vs))
+      (if (ty_vector_float ty))
+      (let ((tmp Reg (vec_cmp_vc rn rm (vector_size ty))))
+       (vec_misc (VecMisc2.Not) tmp (vector_size ty))))
+
+;; 'Less than' operations are implemented by swapping the order of
+;; operands and using the 'greater than' instructions.
+;; 'Not equal' is implemented with 'equal' and inverting the result.
+
+;; Floating-point
+(rule (vec_cmp rn rm ty (Cond.Eq))
+      (if (ty_vector_float ty))
+      (vec_rrr (VecALUOp.Fcmeq) rn rm (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Ne))
+      (if (ty_vector_float ty))
+      (let ((tmp Reg (vec_rrr (VecALUOp.Fcmeq) rn rm (vector_size ty))))
+       (vec_misc (VecMisc2.Not) tmp (vector_size ty))))
+(rule (vec_cmp rn rm ty (Cond.Ge))
+      (if (ty_vector_float ty))
+      (vec_rrr (VecALUOp.Fcmge) rn rm (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Gt))
+      (if (ty_vector_float ty))
+      (vec_rrr (VecALUOp.Fcmgt) rn rm (vector_size ty)))
+;; Floating-point swapped-operands (Mi -> Fcmgt, Ls -> Fcmge)
+(rule (vec_cmp rn rm ty (Cond.Mi))
+      (if (ty_vector_float ty))
+      (vec_rrr (VecALUOp.Fcmgt) rm rn (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Ls))
+      (if (ty_vector_float ty))
+      (vec_rrr (VecALUOp.Fcmge) rm rn (vector_size ty)))
+
+;; Integer (any vector type whose lanes are not floating-point)
+(rule (vec_cmp rn rm ty (Cond.Eq))
+      (if (ty_vector_not_float ty))
+      (vec_rrr (VecALUOp.Cmeq) rn rm (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Ne))
+      (if (ty_vector_not_float ty))
+      (let ((tmp Reg (vec_rrr (VecALUOp.Cmeq) rn rm (vector_size ty))))
+       (vec_misc (VecMisc2.Not) tmp (vector_size ty))))
+(rule (vec_cmp rn rm ty (Cond.Ge))
+      (if (ty_vector_not_float ty))
+      (vec_rrr (VecALUOp.Cmge) rn rm (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Gt))
+      (if (ty_vector_not_float ty))
+      (vec_rrr (VecALUOp.Cmgt) rn rm (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Hs))
+      (if (ty_vector_not_float ty))
+      (vec_rrr (VecALUOp.Cmhs) rn rm (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Hi))
+      (if (ty_vector_not_float ty))
+      (vec_rrr (VecALUOp.Cmhi) rn rm (vector_size ty)))
+;; Integer swapped-operands (Le -> Cmge, Lt -> Cmgt, Ls -> Cmhs, Lo -> Cmhi)
+(rule (vec_cmp rn rm ty (Cond.Le))
+      (if (ty_vector_not_float ty))
+      (vec_rrr (VecALUOp.Cmge) rm rn (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Lt))
+      (if (ty_vector_not_float ty))
+      (vec_rrr (VecALUOp.Cmgt) rm rn (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Ls))
+      (if (ty_vector_not_float ty))
+      (vec_rrr (VecALUOp.Cmhs) rm rn (vector_size ty)))
+(rule (vec_cmp rn rm ty (Cond.Lo))
+      (if (ty_vector_not_float ty))
+      (vec_rrr (VecALUOp.Cmhi) rm rn (vector_size ty)))
+
+;; Helper for determining if any value in a vector is true; yields only flags.
+;; For 128-bit vectors, umaxp creates a scalar value, which is then compared
+;; against zero; 64-bit vectors skip the umaxp and compare lane 0 directly.
+;;
+;; umaxp vn.4s, vm.4s, vm.4s
+;; mov xm, vn.d[0]
+;; cmp xm, #0
+(decl vanytrue (Reg Type) ProducesFlags)
+(rule (vanytrue src (ty_vec128 ty))
+      (let ((src Reg (vec_rrr (VecALUOp.Umaxp) src src (VectorSize.Size32x4)))
+            (src Reg (mov_from_vec src 0 (ScalarSize.Size64))))
+       (cmp_imm (OperandSize.Size64) src (u8_into_imm12 0))))
+(rule (vanytrue src ty)
+      (if (ty_vec64 ty))
+      (let ((src Reg (mov_from_vec src 0 (ScalarSize.Size64))))
+       (cmp_imm (OperandSize.Size64) src (u8_into_imm12 0))))
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -2349,13 +2349,15 @@ impl MachInstEmit for Inst {
             &Inst::VecDupFromFpu { rd, rn, size } => {
                 let rd = allocs.next_writable(rd);
                 let rn = allocs.next(rn);
-                let imm5 = match size {
-                    VectorSize::Size32x4 => 0b00100,
-                    VectorSize::Size64x2 => 0b01000,
+                let q = size.is_128bits() as u32;
+                let imm5 = match size.lane_size() {
+                    ScalarSize::Size32 => 0b00100,
+                    ScalarSize::Size64 => 0b01000,
                     _ => unimplemented!(),
                 };
                 sink.put4(
-                    0b010_01110000_00000_000001_00000_00000
+                    0b000_01110000_00000_000001_00000_00000
+                        | (q << 30)
                         | (imm5 << 16)
                         | (machreg_to_vec(rn) << 5)
                         | machreg_to_vec(rd.to_reg()),

diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -182,24 +182,9 @@
 
 ;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; This operation is implemented by using umaxp to create a scalar value, which
-;; is then compared against zero.
-;;
-;; umaxp vn.4s, vm.4s, vm.4s
-;; mov xm, vn.d[0]
-;; cmp xm, #0
-;; cset xm, ne
-(rule (lower (vany_true x @ (value_type (ty_vec128 ty))))
-      (let ((x1 Reg (vec_rrr (VecALUOp.Umaxp) x x (VectorSize.Size32x4)))
-            (x2 Reg (mov_from_vec x1 0 (ScalarSize.Size64))))
-       (with_flags (cmp_imm (OperandSize.Size64) x2 (u8_into_imm12 0))
-                   (materialize_bool_result (ty_bits ty) (Cond.Ne)))))
-
-(rule (lower (vany_true x @ (value_type ty)))
-      (if (ty_vec64 ty))
-      (let ((x1 Reg (mov_from_vec x 0 (ScalarSize.Size64))))
-       (with_flags (cmp_imm (OperandSize.Size64) x1 (u8_into_imm12 0))
-                   (materialize_bool_result (ty_bits ty) (Cond.Ne)))))
+(rule (lower (has_type out_ty (vany_true x @ (value_type in_ty))))
+      (with_flags (vanytrue x in_ty)
+                  (materialize_bool_result (ty_bits out_ty) (Cond.Ne))))
 
 ;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -1704,6 +1689,16 @@
             (vec_size VectorSize (vector_size ty)))
           (value_reg (float_cmp_zero_swap cond rn vec_size))))
 
+(rule (lower (has_type out_ty
+              (fcmp cond x @ (value_type (ty_scalar_float in_ty)) y)))
+      (with_flags (fpu_cmp (scalar_size in_ty) x y)
+                  (materialize_bool_result
+                   (ty_bits out_ty)
+                   (fp_cond_code cond))))
+;; Vector operands: the lane mask produced by `vec_cmp` is the result.
+(rule (lower (has_type out_ty (fcmp cond x @ (value_type in_ty) y)))
+      (if (ty_vector_float in_ty))
+      (vec_cmp x y in_ty (fp_cond_code cond)))
 
 ;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -1879,7 +1874,6 @@
                   (atomic_cas flags addr src1 src2))))
       (atomic_cas_loop addr src1 src2 ty))
 
-
 ;;;; Rules for 'fvdemote' ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 (rule (lower (fvdemote x))
       (fcvtn x (ScalarSize.Size32)))
@@ -2345,20 +2339,20 @@
 ;; uses. Its second output is a flags output only ever meant to
 ;; check for overflow using the
 ;; `backend.unsigned_add_overflow_condition()` condition.
-;; 
+;;
 ;; Note that the CLIF validation will ensure that no flag-setting
 ;; operation comes between this IaddIfcout and its use (e.g., a
 ;; Trapif). Thus, we can rely on implicit communication through the
 ;; processor flags rather than explicitly generating flags into a
 ;; register. We simply use the variant of the add instruction that
 ;; sets flags (`adds`) here.
-;; 
+;;
 ;; Note that the second output (the flags) need not be generated,
 ;; because flags are never materialized into a register; the only
 ;; instructions that can use a value of type `iflags` or `fflags`
 ;; will look directly for the flags-producing instruction (which can
 ;; always be found, by construction) and merge it.
-;; 
+;;
 ;; Now handle the iadd as above, except use an AddS opcode that sets
 ;; flags.
 

diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -6,12 +6,14 @@ use generated_code::Context;
 
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
-    lower_constant_f128, lower_constant_f32, lower_constant_f64, writable_zero_reg, zero_reg,
-    AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond, CondBrKind,
-    ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC, JTSequenceInfo,
-    MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize, PairAMode, Reg,
-    ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
+    lower_constant_f128, lower_constant_f32, lower_constant_f64, lower_fp_condcode,
+    writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo,
+    CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift,
+    Inst as MInst, IntCC, JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode,
+    Opcode, OperandSize, PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize,
+    NZCV,
 };
+use crate::ir::condcodes;
 use crate::isa::aarch64::inst::{FPULeftShiftImm, FPURightShiftImm};
 use crate::isa::aarch64::lower::{lower_address, lower_pair_address, lower_splat_const};
 use crate::isa::aarch64::settings::Flags as IsaFlags;
@@ -520,6 +522,10 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         rd.to_reg()
     }
 
+    fn fp_cond_code(&mut self, cc: &condcodes::FloatCC) -> Cond {
+        lower_fp_condcode(*cc)
+    }
+
     fn preg_sp(&mut self) -> PReg {
         super::regs::stack_reg().to_real_reg().unwrap().into()
     }

diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -299,25 +299,7 @@ pub(crate) fn lower_insn_to_regs(
             lower_icmp(ctx, insn, condcode, IcmpOutput::Register(rd))?;
         }
 
-        Opcode::Fcmp => {
-            let condcode = ctx.data(insn).fp_cond_code().unwrap();
-            let cond = lower_fp_condcode(condcode);
-            let ty = ctx.input_ty(insn, 0);
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            if !ty.is_vector() {
-                ctx.emit(Inst::FpuCmp {
-                    size: ScalarSize::from_ty(ty),
-                    rn,
-                    rm,
-                });
-                materialize_bool_result(ctx, insn, rd, cond);
-            } else {
-                lower_vector_compare(ctx, rd, rn, rm, ty, cond)?;
-            }
-        }
+        Opcode::Fcmp => implemented_in_isle(ctx),
 
         Opcode::Debugtrap => implemented_in_isle(ctx),
 

diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs
@@ -397,6 +397,28 @@ macro_rules! isle_prelude_methods {
             }
         }
 
+        /// Matches only vector types whose lanes are floating-point,
+        /// returning the type unchanged; yields `None` for anything else.
+        #[inline]
+        fn ty_vector_float(&mut self, ty: Type) -> Option<Type> {
+            if ty.is_vector() && ty.lane_type().is_float() {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
+        /// Matches only vector types whose lanes are not floating-point,
+        /// returning the type unchanged; yields `None` for anything else.
+        #[inline]
+        fn ty_vector_not_float(&mut self, ty: Type) -> Option<Type> {
+            if ty.is_vector() && !ty.lane_type().is_float() {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
         #[inline]
         fn ty_vec64_ctor(&mut self, ty: Type) -> Option<Type> {
             if ty.is_vector() && ty.bits() == 64 {

diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
@@ -381,6 +381,15 @@
 (decl ty_float_or_vec (Type) Type)
 (extern extractor ty_float_or_vec ty_float_or_vec)
 
+;; A pure constructor that only matches vector types with floating-point lanes.
+(decl pure ty_vector_float (Type) Type)
+(extern constructor ty_vector_float ty_vector_float)
+
+;; A pure constructor that only matches vector types with lanes which
+;; are not floating-point.
+(decl pure ty_vector_not_float (Type) Type)
+(extern constructor ty_vector_not_float ty_vector_not_float)
+
 ;; A pure constructor/extractor that only matches 64-bit vector types.
 (decl pure ty_vec64 (Type) Type)
 (extern constructor ty_vec64 ty_vec64_ctor)

diff --git a/cranelift/filetests/filetests/isa/aarch64/simd-logical-compile.clif b/cranelift/filetests/filetests/isa/aarch64/simd-logical-compile.clif
@@ -22,7 +22,7 @@ block0(v0: b32x4):
 ;   umaxp v3.4s, v0.4s, v0.4s
 ;   mov x5, v3.d[0]
 ;   subs xzr, x5, #0
-;   csetm x0, ne
+;   cset x0, ne
 ;   ret
 
 function %vall_true_i64x2(i64x2) -> b1 {

diff --git a/cranelift/filetests/filetests/runtests/simd-conversion.clif b/cranelift/filetests/filetests/runtests/simd-conversion.clif
@@ -39,3 +39,11 @@ block0(v0:f32x4):
 ; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0]
 ; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0]
 ; run: %fcvt_to_uint_sat([+NaN +NaN +NaN +NaN]) == [0 0 0 0]
+
+function %fcvt_low_from_sint(i32x4) -> f64x2 {
+block0(v0: i32x4):
+    v1 = fcvt_low_from_sint.f64x2 v0
+    return v1
+}
+; run: %fcvt_low_from_sint([0 1 -1 65535]) == [0x0.0 0x1.0]
+; run: %fcvt_low_from_sint([-1 123456789 0 1]) == [-0x1.0 0x1.d6f3454p26]