riscv64: Implement SIMD shifts, v{all,any}_true and vhigh_bits (#…

…6507) * riscv64: Add SIMD shifts * riscv64: Implement SIMD `vall_true` * riscv64: Implement SIMD `vany_true` * riscv64: Add SIMD `vhigh_bits` * wasmtime: Enable more RISC-V SIMD tests
bytecodealliance · Jun 3, 2023 · f7ae056 · f7ae056
1 parent 176935e
commit f7ae056
Show file tree

Hide file tree

Showing 25 changed files with 5,441 additions and 18 deletions.
diff --git a/build.rs b/build.rs
@@ -237,12 +237,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
                 "almost_extmul",
                 "canonicalize_nan",
                 "cvt_from_uint",
-                "issue4807",
                 "issue_3327_bnot_lowering",
-                "load_splat_out_of_bounds",
                 "simd_align",
-                "simd_bit_shift",
-                "simd_boolean",
                 "simd_conversions",
                 "simd_f32x4",
                 "simd_f32x4_cmp",

diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle
@@ -1573,10 +1573,19 @@
 
 ;; UImm5 Helpers
 
+;; Helper to go directly from a `Value`, when it's an `iconst`, to a `UImm5`.
+(decl uimm5_from_value (UImm5) Value)
+(extractor (uimm5_from_value n)
+  (iconst (u64_from_imm64 (uimm5_from_u64 n))))
+
 ;; Extract a `UImm5` from an `u8`.
 (decl pure partial uimm5_from_u8 (UImm5) u8)
 (extern extractor uimm5_from_u8 uimm5_from_u8)
 
+;; Extract a `UImm5` from an `u64`.
+(decl pure partial uimm5_from_u64 (UImm5) u64)
+(extern extractor uimm5_from_u64 uimm5_from_u64)
+
 (decl uimm5_bitcast_to_imm5 (UImm5) Imm5)
 (extern constructor uimm5_bitcast_to_imm5 uimm5_bitcast_to_imm5)
 
@@ -1666,6 +1675,14 @@
   (rv_and (value_regs_get x 0) (value_regs_get y 0)))
 
 
+;; AND an `XReg` with a 64-bit constant. When `imm12_from_u64` accepts the
+;; constant it is folded into an `andi`; otherwise it is materialized with
+;; `imm` and combined through a register-register `and`.
+(decl gen_andi (XReg u64) XReg)
+(rule 1 (gen_andi src (imm12_from_u64 small))
+  (rv_andi src small))
+
+(rule 0 (gen_andi src wide)
+  (rv_and src (imm $I64 wide)))
+
+
 (decl gen_or (Type ValueRegs ValueRegs) ValueRegs)
 (rule 1 (gen_or $I128 x y)
   (value_regs

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -275,12 +275,15 @@ impl VecAluOpRRR {
             VecAluOpRRR::VmulVV => 0b100101,
             VecAluOpRRR::VmulhVV => 0b100111,
             VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100,
+            VecAluOpRRR::VsllVV | VecAluOpRRR::VsllVX => 0b100101,
+            VecAluOpRRR::VsrlVV | VecAluOpRRR::VsrlVX => 0b101000,
+            VecAluOpRRR::VsraVV | VecAluOpRRR::VsraVX => 0b101001,
             VecAluOpRRR::VandVV | VecAluOpRRR::VandVX => 0b001001,
             VecAluOpRRR::VorVV | VecAluOpRRR::VorVX => 0b001010,
             VecAluOpRRR::VxorVV | VecAluOpRRR::VxorVX => 0b001011,
-            VecAluOpRRR::VminuVV | VecAluOpRRR::VminuVX => 0b000100,
+            VecAluOpRRR::VminuVV | VecAluOpRRR::VminuVX | VecAluOpRRR::VredminuVS => 0b000100,
             VecAluOpRRR::VminVV | VecAluOpRRR::VminVX => 0b000101,
-            VecAluOpRRR::VmaxuVV | VecAluOpRRR::VmaxuVX => 0b000110,
+            VecAluOpRRR::VmaxuVV | VecAluOpRRR::VmaxuVX | VecAluOpRRR::VredmaxuVS => 0b000110,
             VecAluOpRRR::VmaxVV | VecAluOpRRR::VmaxVX => 0b000111,
             VecAluOpRRR::VslidedownVX => 0b001111,
             VecAluOpRRR::VfrsubVF => 0b100111,
@@ -293,6 +296,7 @@ impl VecAluOpRRR {
             VecAluOpRRR::VssubuVV | VecAluOpRRR::VssubuVX => 0b100010,
             VecAluOpRRR::VssubVV | VecAluOpRRR::VssubVX => 0b100011,
             VecAluOpRRR::VfsgnjnVV => 0b001001,
+            VecAluOpRRR::VmsltVX => 0b011011,
         }
     }
 
@@ -304,6 +308,9 @@ impl VecAluOpRRR {
             | VecAluOpRRR::VsubVV
             | VecAluOpRRR::VssubVV
             | VecAluOpRRR::VssubuVV
+            | VecAluOpRRR::VsllVV
+            | VecAluOpRRR::VsrlVV
+            | VecAluOpRRR::VsraVV
             | VecAluOpRRR::VandVV
             | VecAluOpRRR::VorVV
             | VecAluOpRRR::VxorVV
@@ -312,16 +319,21 @@ impl VecAluOpRRR {
             | VecAluOpRRR::VmaxuVV
             | VecAluOpRRR::VmaxVV
             | VecAluOpRRR::VmergeVVM => VecOpCategory::OPIVV,
-            VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => {
-                VecOpCategory::OPMVV
-            }
+            VecAluOpRRR::VmulVV
+            | VecAluOpRRR::VmulhVV
+            | VecAluOpRRR::VmulhuVV
+            | VecAluOpRRR::VredmaxuVS
+            | VecAluOpRRR::VredminuVS => VecOpCategory::OPMVV,
             VecAluOpRRR::VaddVX
             | VecAluOpRRR::VsaddVX
             | VecAluOpRRR::VsadduVX
             | VecAluOpRRR::VsubVX
             | VecAluOpRRR::VssubVX
             | VecAluOpRRR::VssubuVX
             | VecAluOpRRR::VrsubVX
+            | VecAluOpRRR::VsllVX
+            | VecAluOpRRR::VsrlVX
+            | VecAluOpRRR::VsraVX
             | VecAluOpRRR::VandVX
             | VecAluOpRRR::VorVX
             | VecAluOpRRR::VxorVX
@@ -330,7 +342,8 @@ impl VecAluOpRRR {
             | VecAluOpRRR::VmaxuVX
             | VecAluOpRRR::VmaxVX
             | VecAluOpRRR::VslidedownVX
-            | VecAluOpRRR::VmergeVXM => VecOpCategory::OPIVX,
+            | VecAluOpRRR::VmergeVXM
+            | VecAluOpRRR::VmsltVX => VecOpCategory::OPIVX,
             VecAluOpRRR::VfaddVV
             | VecAluOpRRR::VfsubVV
             | VecAluOpRRR::VfmulVV
@@ -385,6 +398,9 @@ impl VecAluOpRRImm5 {
         match self {
             VecAluOpRRImm5::VaddVI => 0b000000,
             VecAluOpRRImm5::VrsubVI => 0b000011,
+            VecAluOpRRImm5::VsllVI => 0b100101,
+            VecAluOpRRImm5::VsrlVI => 0b101000,
+            VecAluOpRRImm5::VsraVI => 0b101001,
             VecAluOpRRImm5::VandVI => 0b001001,
             VecAluOpRRImm5::VorVI => 0b001010,
             VecAluOpRRImm5::VxorVI => 0b001011,
@@ -399,6 +415,9 @@ impl VecAluOpRRImm5 {
         match self {
             VecAluOpRRImm5::VaddVI
             | VecAluOpRRImm5::VrsubVI
+            | VecAluOpRRImm5::VsllVI
+            | VecAluOpRRImm5::VsrlVI
+            | VecAluOpRRImm5::VsraVI
             | VecAluOpRRImm5::VandVI
             | VecAluOpRRImm5::VorVI
             | VecAluOpRRImm5::VxorVI
@@ -411,7 +430,10 @@ impl VecAluOpRRImm5 {
 
     pub fn imm_is_unsigned(&self) -> bool {
         match self {
-            VecAluOpRRImm5::VslidedownVI => true,
+            VecAluOpRRImm5::VsllVI
+            | VecAluOpRRImm5::VsrlVI
+            | VecAluOpRRImm5::VsraVI
+            | VecAluOpRRImm5::VslidedownVI => true,
             VecAluOpRRImm5::VaddVI
             | VecAluOpRRImm5::VrsubVI
             | VecAluOpRRImm5::VandVI

diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -99,6 +99,9 @@
   (VmulVV)
   (VmulhVV)
   (VmulhuVV)
+  (VsllVV)
+  (VsrlVV)
+  (VsraVV)
   (VandVV)
   (VorVV)
   (VxorVV)
@@ -112,6 +115,8 @@
   (VfdivVV)
   (VfsgnjnVV)
   (VmergeVVM)
+  (VredmaxuVS)
+  (VredminuVS)
 
   ;; Vector-Scalar Opcodes
   (VaddVX)
@@ -121,6 +126,9 @@
   (VrsubVX)
   (VssubVX)
   (VssubuVX)
+  (VsllVX)
+  (VsrlVX)
+  (VsraVX)
   (VandVX)
   (VorVX)
   (VxorVX)
@@ -137,6 +145,7 @@
   (VfrdivVF)
   (VmergeVXM)
   (VfmergeVFM)
+  (VmsltVX)
 ))
 
 ;; Register-Imm ALU Ops
@@ -146,6 +155,9 @@
   (VsaddVI)
   (VsadduVI)
   (VrsubVI)
+  (VsllVI)
+  (VsrlVI)
+  (VsraVI)
   (VandVI)
   (VorVI)
   (VxorVI)
@@ -388,6 +400,51 @@
 (rule (rv_vmulhu_vv vs2 vs1 mask vstate)
   (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 mask vstate))
 
+;; Helper for emitting the `vsll.vv` instruction.
+(decl rv_vsll_vv (VReg VReg VecOpMasking VState) VReg)
+(rule (rv_vsll_vv vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VsllVV) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vsll.vx` instruction.
+(decl rv_vsll_vx (VReg XReg VecOpMasking VState) VReg)
+(rule (rv_vsll_vx vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VsllVX) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vsll.vi` instruction.
+(decl rv_vsll_vi (VReg UImm5 VecOpMasking VState) VReg)
+(rule (rv_vsll_vi vs2 imm mask vstate)
+  (vec_alu_rr_uimm5 (VecAluOpRRImm5.VsllVI) vs2 imm mask vstate))
+
+;; Helper for emitting the `vsrl.vv` instruction.
+(decl rv_vsrl_vv (VReg VReg VecOpMasking VState) VReg)
+(rule (rv_vsrl_vv vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VsrlVV) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vsrl.vx` instruction.
+(decl rv_vsrl_vx (VReg XReg VecOpMasking VState) VReg)
+(rule (rv_vsrl_vx vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VsrlVX) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vsrl.vi` instruction.
+(decl rv_vsrl_vi (VReg UImm5 VecOpMasking VState) VReg)
+(rule (rv_vsrl_vi vs2 imm mask vstate)
+  (vec_alu_rr_uimm5 (VecAluOpRRImm5.VsrlVI) vs2 imm mask vstate))
+
+;; Helper for emitting the `vsra.vv` instruction.
+(decl rv_vsra_vv (VReg VReg VecOpMasking VState) VReg)
+(rule (rv_vsra_vv vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VsraVV) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vsra.vx` instruction.
+(decl rv_vsra_vx (VReg XReg VecOpMasking VState) VReg)
+(rule (rv_vsra_vx vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VsraVX) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vsra.vi` instruction.
+(decl rv_vsra_vi (VReg UImm5 VecOpMasking VState) VReg)
+(rule (rv_vsra_vi vs2 imm mask vstate)
+  (vec_alu_rr_uimm5 (VecAluOpRRImm5.VsraVI) vs2 imm mask vstate))
+
 ;; Helper for emitting the `vand.vv` instruction.
 (decl rv_vand_vv (VReg VReg VecOpMasking VState) VReg)
 (rule (rv_vand_vv vs2 vs1 mask vstate)
@@ -647,6 +704,25 @@
   (vec_alu_rr_imm5 (VecAluOpRRImm5.VmergeVIM) vs2 imm (masked mask) vstate))
 
 
+;; Helper for emitting the `vredminu.vs` instruction.
+;;
+;; vd[0] = minu( vs1[0] , vs2[*] )
+(decl rv_vredminu_vs (VReg VReg VecOpMasking VState) VReg)
+(rule (rv_vredminu_vs vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VredminuVS) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vredmaxu.vs` instruction.
+;;
+;; vd[0] = maxu( vs1[0] , vs2[*] )
+(decl rv_vredmaxu_vs (VReg VReg VecOpMasking VState) VReg)
+(rule (rv_vredmaxu_vs vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VredmaxuVS) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vmslt.vx` (Vector Mask Set Less Than) instruction.
+(decl rv_vmslt_vx (VReg XReg VecOpMasking VState) VReg)
+(rule (rv_vmslt_vx vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VmsltVX) vs2 vs1 mask vstate))
+
 ;;;; Multi-Instruction Helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (decl gen_extractlane (Type VReg u8) Reg)

diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -563,6 +563,14 @@
       (gen_select_reg (IntCC.UnsignedGreaterThanOrEqual) shamt_128 const64 (zero_reg) low)
       (gen_select_reg (IntCC.UnsignedGreaterThanOrEqual) shamt_128 const64 low high))))
 
+;; SIMD Cases
+;; We don't need to mask anything since it is done by the instruction according to SEW.
+
+(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (ishl x y)))
+  (rv_vsll_vx x (value_regs_get y 0) (unmasked) ty))
+
+(rule 6 (lower (has_type (ty_vec_fits_in_register ty) (ishl x (maybe_uextend (uimm5_from_value y)))))
+  (rv_vsll_vi x y (unmasked) ty))
 
 ;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -609,6 +617,14 @@
       (gen_select_reg (IntCC.UnsignedGreaterThanOrEqual) shamt_128 const64 high low)
       (gen_select_reg (IntCC.UnsignedGreaterThanOrEqual) shamt_128 const64 (zero_reg) high))))
 
+;; SIMD Cases
+;; We don't need to mask or extend anything since it is done by the instruction according to SEW.
+
+(rule 4 (lower (has_type (ty_vec_fits_in_register ty) (ushr x y)))
+  (rv_vsrl_vx x (value_regs_get y 0) (unmasked) ty))
+
+(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (ushr x (maybe_uextend (uimm5_from_value y)))))
+  (rv_vsrl_vi x y (unmasked) ty))
 
 ;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -660,6 +676,15 @@
       (gen_select_reg (IntCC.UnsignedGreaterThanOrEqual) shamt_128 const64 high low)
       (gen_select_reg (IntCC.UnsignedGreaterThanOrEqual) shamt_128 const64 high_replacement high))))
 
+;; SIMD Cases
+;; We don't need to mask or extend anything since it is done by the instruction according to SEW.
+
+(rule 4 (lower (has_type (ty_vec_fits_in_register ty) (sshr x y)))
+  (rv_vsra_vx x (value_regs_get y 0) (unmasked) ty))
+
+(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (sshr x (maybe_uextend (uimm5_from_value y)))))
+  (rv_vsra_vi x y (unmasked) ty))
+
 
 ;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 (rule (lower (has_type (fits_in_64 ty) (rotl x y)))
@@ -1334,3 +1359,51 @@
 
 (rule 1 (lower (has_type (ty_vec_fits_in_register ty) (ssub_sat x (splat y))))
   (rv_vssub_vx x y (unmasked) ty))
+
+;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Here we do a Vector Reduce operation. Get the unsigned minimum value of any
+;; lane in the vector. The fixed input to the reduce operation is a 1.
+;; This way, if any lane is 0, the result will be 0. Otherwise, the result will
+;; be a 1.
+;; The reduce operation leaves the result in the lowest lane, we then move it
+;; into the destination X register.
+(rule (lower (vall_true x @ (value_type (ty_vec_fits_in_register ty))))
+  (if-let one (imm5_from_i8 1))
+  ;; We don't need to broadcast the immediate into all lanes, only into lane 0.
+  ;; It is done this way anyway since it uses one fewer instruction than a `vmv.s.x`.
+  (let ((fixed VReg (rv_vmv_vi one ty))
+        (min VReg (rv_vredminu_vs x fixed (unmasked) ty)))
+    (rv_vmv_xs min ty)))
+
+
+;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Here we do a Vector Reduce operation. Get the unsigned maximum value of the
+;; input vector register. Move the max to an X register, and do a `snez` on it
+;; to ensure it's either 1 or 0.
+(rule (lower (vany_true x @ (value_type (ty_vec_fits_in_register ty))))
+  (let ((max VReg (rv_vredmaxu_vs x x (unmasked) ty))
+        (x_max XReg (rv_vmv_xs max ty)))
+    (rv_snez x_max)))
+
+
+;;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; To check if the MSB of a lane is set, we do a `vmslt` with zero, this sets
+;; the mask bit to 1 if the value is negative (MSB 1) and 0 if not. We can then
+;; just move that mask to an X Register.
+;;
+;; We must ensure that the move to the X register has a SEW with enough bits
+;; to hold the full mask. Additionally, in some cases (e.g. i64x2) we are going
+;; to read some tail bits. These are undefined, so we need to further mask them
+;; off.
+(rule (lower (vhigh_bits x @ (value_type (ty_vec_fits_in_register ty))))
+  (let ((mask VReg (rv_vmslt_vx x (zero_reg) (unmasked) ty))
+        ;; Here we only need I64X1, but emit an AVL of 2 since it
+        ;; saves one vector state change in the case of I64X2.
+        ;;
+        ;; TODO: For types that have more lanes than element bits, we can
+        ;; use the original type as a VState and avoid a state change.
+        (x_mask XReg (rv_vmv_xs mask (vstate_from_type $I64X2))))
+    (gen_andi x_mask (ty_lane_mask ty))))
diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs
@@ -267,6 +267,10 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend>
         UImm5::maybe_from_u8(arg0)
     }
     #[inline]
+    fn uimm5_from_u64(&mut self, arg0: u64) -> Option<UImm5> {
+        // Narrow to `u8` first: a value that does not fit yields `None`.
+        // `UImm5::maybe_from_u8` then decides whether the narrowed value
+        // is an acceptable `UImm5`.
+        let narrow: u8 = arg0.try_into().ok()?;
+        UImm5::maybe_from_u8(narrow)
+    }
+    #[inline]
     fn writable_zero_reg(&mut self) -> WritableReg {
         writable_zero_reg()
     }