diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle index 53b0e65edaef..f035789c1fca 100644 --- a/cranelift/codegen/src/isa/aarch64/inst.isle +++ b/cranelift/codegen/src/isa/aarch64/inst.isle @@ -553,7 +553,8 @@ (VecDupFromFpu (rd WritableReg) (rn Reg) - (size VectorSize)) + (size VectorSize) + (lane u8)) ;; Duplicate FP immediate to vector. (VecDupFPImm @@ -1390,8 +1391,18 @@ (Addp) ;; Zip vectors (primary) [meaning, high halves] (Zip1) + ;; Zip vectors (secondary) + (Zip2) ;; Signed saturating rounding doubling multiply returning high half (Sqrdmulh) + ;; Unzip vectors (primary) + (Uzp1) + ;; Unzip vectors (secondary) + (Uzp2) + ;; Transpose vectors (primary) + (Trn1) + ;; Transpose vectors (secondary) + (Trn2) )) ;; A Vector ALU operation which modifies a source register. @@ -1420,6 +1431,10 @@ (Fneg) ;; Floating-point square root (Fsqrt) + ;; Reverse elements in 16-bit lanes + (Rev16) + ;; Reverse elements in 32-bit lanes + (Rev32) ;; Reverse elements in 64-bit doublewords (Rev64) ;; Floating-point convert to signed integer, rounding toward zero @@ -1887,10 +1902,10 @@ dst)) ;; Helper for emitting `MInst.VecDupFromFpu` instructions. -(decl vec_dup_from_fpu (Reg VectorSize) Reg) -(rule (vec_dup_from_fpu src size) +(decl vec_dup_from_fpu (Reg VectorSize u8) Reg) +(rule (vec_dup_from_fpu src size lane) (let ((dst WritableReg (temp_writable_reg $I8X16)) - (_ Unit (emit (MInst.VecDupFromFpu dst src size)))) + (_ Unit (emit (MInst.VecDupFromFpu dst src size lane)))) dst)) ;; Helper for emitting `MInst.AluRRImm12` instructions. @@ -2386,6 +2401,14 @@ (decl neg (Reg VectorSize) Reg) (rule (neg x size) (vec_misc (VecMisc2.Neg) x size)) +;; Helper for generating `rev16` instructions. +(decl rev16 (Reg VectorSize) Reg) +(rule (rev16 x size) (vec_misc (VecMisc2.Rev16) x size)) + +;; Helper for generating `rev32` instructions. +(decl rev32 (Reg VectorSize) Reg) +(rule (rev32 x size) (vec_misc (VecMisc2.Rev32) x size)) + ;; Helper for generating `rev64` instructions. 
(decl rev64 (Reg VectorSize) Reg) (rule (rev64 x size) (vec_misc (VecMisc2.Rev64) x size)) @@ -3767,3 +3790,27 @@ (emit_side_effect (with_flags_side_effect (cmp (OperandSize.Size32) ridx jt_size) (jt_sequence ridx jt_info))))) + +;; Helper for emitting the `uzp1` instruction +(decl vec_uzp1 (Reg Reg VectorSize) Reg) +(rule (vec_uzp1 rn rm size) (vec_rrr (VecALUOp.Uzp1) rn rm size)) + +;; Helper for emitting the `uzp2` instruction +(decl vec_uzp2 (Reg Reg VectorSize) Reg) +(rule (vec_uzp2 rn rm size) (vec_rrr (VecALUOp.Uzp2) rn rm size)) + +;; Helper for emitting the `zip1` instruction +(decl vec_zip1 (Reg Reg VectorSize) Reg) +(rule (vec_zip1 rn rm size) (vec_rrr (VecALUOp.Zip1) rn rm size)) + +;; Helper for emitting the `zip2` instruction +(decl vec_zip2 (Reg Reg VectorSize) Reg) +(rule (vec_zip2 rn rm size) (vec_rrr (VecALUOp.Zip2) rn rm size)) + +;; Helper for emitting the `trn1` instruction +(decl vec_trn1 (Reg Reg VectorSize) Reg) +(rule (vec_trn1 rn rm size) (vec_rrr (VecALUOp.Trn1) rn rm size)) + +;; Helper for emitting the `trn2` instruction +(decl vec_trn2 (Reg Reg VectorSize) Reg) +(rule (vec_trn2 rn rm size) (vec_rrr (VecALUOp.Trn2) rn rm size)) diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 72b1e1176ba8..2332ff19ba35 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1977,8 +1977,20 @@ impl MachInstEmit for Inst { ); (0b1, 0b11111, enc_size) } + VecMisc2::Rev16 => { + debug_assert_eq!(size, VectorSize::Size8x16); + (0b0, 0b00001, enc_size) + } + VecMisc2::Rev32 => { + debug_assert!(size == VectorSize::Size8x16 || size == VectorSize::Size16x8); + (0b1, 0b00000, enc_size) + } VecMisc2::Rev64 => { - debug_assert_ne!(VectorSize::Size64x2, size); + debug_assert!( + size == VectorSize::Size8x16 + || size == VectorSize::Size16x8 + || size == VectorSize::Size32x4 + ); (0b0, 0b00000, enc_size) } VecMisc2::Fcvtzs => { @@ -2493,13 +2505,27 @@ impl MachInstEmit for Inst { | machreg_to_vec(rd.to_reg()), ); } - &Inst::VecDupFromFpu { rd, rn, size } => { + &Inst::VecDupFromFpu { rd, rn, size, lane } => { let rd = allocs.next_writable(rd); let rn = allocs.next(rn); let q = size.is_128bits() as u32; let imm5 = match size.lane_size() { - ScalarSize::Size32 => 0b00100, - ScalarSize::Size64 => 0b01000, + ScalarSize::Size8 => { + assert!(lane < 16); + 0b00001 | (u32::from(lane) << 1) + } + ScalarSize::Size16 => { + assert!(lane < 8); + 0b00010 | (u32::from(lane) << 2) + } + ScalarSize::Size32 => { + assert!(lane < 4); + 0b00100 | (u32::from(lane) << 3) + } + ScalarSize::Size64 => { + assert!(lane < 2); + 0b01000 | (u32::from(lane) << 4) + } _ => unimplemented!(), }; sink.put4( @@ -2870,6 +2896,7 @@ impl MachInstEmit for Inst { VecALUOp::Fmul => (0b001_01110_00_1, 0b110111), VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111), VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110), + VecALUOp::Zip2 => (0b01001110_00_0 | enc_size << 1, 0b011110), VecALUOp::Sqrdmulh => { debug_assert!( size.lane_size() == ScalarSize::Size16 @@ -2878,6 +2905,10 @@ impl MachInstEmit for Inst { (0b001_01110_00_1 | enc_size << 1, 0b101101) } + VecALUOp::Uzp1 => (0b01001110_00_0 | enc_size << 1, 0b000110), + VecALUOp::Uzp2 => (0b01001110_00_0 | enc_size << 1, 0b010110), + VecALUOp::Trn1 => (0b01001110_00_0 | enc_size << 1, 0b001010), + VecALUOp::Trn2 => (0b01001110_00_0 | enc_size << 1, 0b011010), }; let top11 = if is_float { top11 | size.enc_float_size() << 1 diff --git 
a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 0670a3faefdc..e06221836bb5 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -2657,6 +2657,7 @@ fn test_aarch64_binemit() { rd: writable_vreg(14), rn: vreg(19), size: VectorSize::Size32x4, + lane: 0, }, "6E06044E", "dup v14.4s, v19.s[0]", @@ -2666,6 +2667,7 @@ fn test_aarch64_binemit() { rd: writable_vreg(18), rn: vreg(10), size: VectorSize::Size64x2, + lane: 0, }, "5205084E", "dup v18.2d, v10.d[0]", diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 779782907f79..1d91d08983ea 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -2123,9 +2123,9 @@ impl Inst { let rn = pretty_print_ireg(rn, size.operand_size(), allocs); format!("dup {}, {}", rd, rn) } - &Inst::VecDupFromFpu { rd, rn, size } => { + &Inst::VecDupFromFpu { rd, rn, size, lane } => { let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs); - let rn = pretty_print_vreg_element(rn, 0, size.lane_size(), allocs); + let rn = pretty_print_vreg_element(rn, lane.into(), size.lane_size(), allocs); format!("dup {}, {}", rd, rn) } &Inst::VecDupFPImm { rd, imm, size } => { @@ -2345,7 +2345,12 @@ impl Inst { VecALUOp::Fmul => ("fmul", size), VecALUOp::Addp => ("addp", size), VecALUOp::Zip1 => ("zip1", size), + VecALUOp::Zip2 => ("zip2", size), VecALUOp::Sqrdmulh => ("sqrdmulh", size), + VecALUOp::Uzp1 => ("uzp1", size), + VecALUOp::Uzp2 => ("uzp2", size), + VecALUOp::Trn1 => ("trn1", size), + VecALUOp::Trn2 => ("trn2", size), }; let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs); let rn = pretty_print_vreg_vector(rn, size, allocs); @@ -2471,6 +2476,8 @@ impl Inst { VecMisc2::Fabs => ("fabs", size, ""), VecMisc2::Fneg => ("fneg", size, ""), VecMisc2::Fsqrt => ("fsqrt", size, ""), + VecMisc2::Rev16 => ("rev16", size, ""), + VecMisc2::Rev32 => ("rev32", size, ""), VecMisc2::Rev64 => ("rev64", size, ""), VecMisc2::Fcvtzs => ("fcvtzs", size, ""), VecMisc2::Fcvtzu => ("fcvtzu", size, ""), diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index edb112447333..f3a3db7039b6 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -118,6 +118,118 @@ ;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; When a single element of one vector is broadcast to all the destination +;; lanes then the `dup` instruction can be used for this operation. Note that +;; for now this only matches lane selection from the first vector `a`, but +;; if necessary in the future rules can be added to select from `b` as well. +(rule 6 (lower (shuffle a b (shuffle_dup8_from_imm n))) + (vec_dup_from_fpu a (VectorSize.Size8x16) n)) +(rule 5 (lower (shuffle a b (shuffle_dup16_from_imm n))) + (vec_dup_from_fpu a (VectorSize.Size16x8) n)) +(rule 4 (lower (shuffle a b (shuffle_dup32_from_imm n))) + (vec_dup_from_fpu a (VectorSize.Size32x4) n)) +(rule 3 (lower (shuffle a b (shuffle_dup64_from_imm n))) + (vec_dup_from_fpu a (VectorSize.Size64x2) n)) + +;; If the `Immediate` specified to the extractor looks like a duplication of the +;; `n`th lane of the first vector of size K-byte lanes, then each extractor +;; returns the `n` value as a `u8` to be used as part of a `vec_dup_from_fpu` +;; instruction. 
Note that there's a different extractor for each bit-width of +;; lane. +(decl shuffle_dup8_from_imm (u8) Immediate) +(extern extractor shuffle_dup8_from_imm shuffle_dup8_from_imm) +(decl shuffle_dup16_from_imm (u8) Immediate) +(extern extractor shuffle_dup16_from_imm shuffle_dup16_from_imm) +(decl shuffle_dup32_from_imm (u8) Immediate) +(extern extractor shuffle_dup32_from_imm shuffle_dup32_from_imm) +(decl shuffle_dup64_from_imm (u8) Immediate) +(extern extractor shuffle_dup64_from_imm shuffle_dup64_from_imm) + +;; When the shuffle looks like "concatenate `a` and `b` and shift right by n*8 +;; bytes", that's an `ext` instruction. +(rule 2 (lower (shuffle a b (vec_extract_imm4_from_immediate n))) + (vec_extract a b n)) + +;; Attempts to extract `n` from the specified shuffle `Immediate` where each +;; byte of the `Immediate` is a consecutive sequence starting from `n`. This +;; value of `n` is used as part of the `vec_extract` instruction which extracts +;; consecutive bytes from two vectors into one final vector, offset by `n` +;; bytes. +(decl vec_extract_imm4_from_immediate (u8) Immediate) +(extern extractor vec_extract_imm4_from_immediate vec_extract_imm4_from_immediate) + +;; Rules for the `uzp1` and `uzp2` instructions which gather even-numbered lanes +;; or odd-numbered lanes +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1e1c_1a18_1614_1210_0e0c_0a08_0604_0200))) + (vec_uzp1 a b (VectorSize.Size8x16))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1d_1b19_1715_1311_0f0d_0b09_0705_0301))) + (vec_uzp2 a b (VectorSize.Size8x16))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1d1c_1918_1514_1110_0d0c_0908_0504_0100))) + (vec_uzp1 a b (VectorSize.Size16x8))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e_1b1a_1716_1312_0f0e_0b0a_0706_0302))) + (vec_uzp2 a b (VectorSize.Size16x8))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1b1a1918_13121110_0b0a0908_03020100))) + (vec_uzp1 a b (VectorSize.Size32x4))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_17161514_0f0e0d0c_07060504))) + (vec_uzp2 a b (VectorSize.Size32x4))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100))) + (vec_uzp1 a b (VectorSize.Size64x2))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908))) + (vec_uzp2 a b (VectorSize.Size64x2))) + +;; Rules for the `zip1` and `zip2` instructions which interleave lanes in the +;; low or high halves of the two input vectors. +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000))) + (vec_zip1 a b (VectorSize.Size8x16))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808))) + (vec_zip2 a b (VectorSize.Size8x16))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100))) + (vec_zip1 a b (VectorSize.Size16x8))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908))) + (vec_zip2 a b (VectorSize.Size16x8))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100))) + (vec_zip1 a b (VectorSize.Size32x4))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908))) + (vec_zip2 a b (VectorSize.Size32x4))) +;; Note that zip1/zip2 for i64x2 vectors is omitted since it's already covered +;; by the i64x2 cases of uzp1/uzp2 above where both zip and uzp have the same +;; semantics for 64-bit lanes. 
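The `u128_from_immediate` constants in the uzp/zip rules above are nothing more than the sixteen output byte indices of the CLIF `shuffle` packed little-endian into one `u128`, with indices 0-15 selecting bytes of `a` and 16-31 selecting bytes of `b`. As a standalone illustration (not part of this patch; `imm_from_byte_sel` is a made-up helper, not a Cranelift API), the byte-level `uzp1` and `zip1` constants can be re-derived like this:

// Derive a shuffle immediate: byte i (little-endian) of the u128 holds the
// index of the input byte selected for output position i.
fn imm_from_byte_sel(sel: impl Fn(usize) -> u128) -> u128 {
    (0..16).fold(0u128, |imm, i| imm | (sel(i) << (8 * i)))
}

fn main() {
    // uzp1.16b gathers the even-numbered bytes of the concatenation a:b.
    let uzp1 = imm_from_byte_sel(|i| 2 * i as u128);
    // zip1.16b interleaves the bytes of the low halves of `a` and `b`.
    let zip1 = imm_from_byte_sel(|i| if i % 2 == 0 { (i / 2) as u128 } else { (16 + i / 2) as u128 });
    // These match the Size8x16 constants in the rules above.
    assert_eq!(uzp1, 0x1e1c_1a18_1614_1210_0e0c_0a08_0604_0200);
    assert_eq!(zip1, 0x1707_1606_1505_1404_1303_1202_1101_1000);
    println!("uzp1.16b: {uzp1:#034x}");
    println!("zip1.16b: {zip1:#034x}");
}

The wider-lane constants follow the same pattern with 2-, 4-, or 8-byte groups, which is also why the 64-bit zip and trn cases collapse into the uzp rules noted above.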
+ +;; Rules for the `trn1` and `trn2` instructions which interleave the +;; even-numbered or odd-numbered lanes of the two input vectors. +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1e0e_1c0c_1a0a_1808_1606_1404_1202_1000))) + (vec_trn1 a b (VectorSize.Size8x16))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f0f_1d0d_1b0b_1909_1707_1505_1303_1101))) + (vec_trn2 a b (VectorSize.Size8x16))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1d1c_0d0c_1918_0908_1514_0504_1110_0100))) + (vec_trn1 a b (VectorSize.Size16x8))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1b1a_0b0a_1716_0706_1312_0302))) + (vec_trn2 a b (VectorSize.Size16x8))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1b1a1918_0b0a0908_13121110_03020100))) + (vec_trn1 a b (VectorSize.Size32x4))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_17161514_07060504))) + (vec_trn2 a b (VectorSize.Size32x4))) +;; Note that trn1/trn2 for i64x2 vectors is omitted since it's already covered +;; by the i64x2 cases of uzp1/uzp2 above where both trn and uzp have the same +;; semantics for 64-bit lanes. + +;; Rules for the `rev{16,32,64}` instructions where reversals happen at either +;; the byte level, the 16-bit level, or 32-bit level. Note that all of these +;; patterns only match reversals in the first operand, but they can +;; theoretically be extended if necessary to reversals in the second operand. +(rule 1 (lower (shuffle a b (u128_from_immediate 0x0e0f_0c0d_0a0b_0809_0607_0405_0203_0001))) + (rev16 a (VectorSize.Size8x16))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x0c0d0e0f_08090a0b_04050607_00010203))) + (rev32 a (VectorSize.Size8x16))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x0d0c0f0e_09080b0a_05040706_01000302))) + (rev32 a (VectorSize.Size16x8))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x08090a0b0c0d0e0f_0001020304050607))) + (rev64 a (VectorSize.Size8x16))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x09080b0a0d0c0f0e_0100030205040706))) + (rev64 a (VectorSize.Size16x8))) +(rule 1 (lower (shuffle a b (u128_from_immediate 0x0b0a09080f0e0d0c_0302010007060504))) + (rev64 a (VectorSize.Size32x4))) + (rule (lower (has_type ty (shuffle rn rn2 (u128_from_immediate mask)))) (let ((mask_reg Reg (constant_f128 mask))) (vec_tbl2 rn rn2 mask_reg ty))) @@ -1840,7 +1952,7 @@ (vec_dup x (vector_size ty))) (rule -2 (lower (has_type ty (splat x @ (value_type (ty_scalar_float _))))) - (vec_dup_from_fpu x (vector_size ty))) + (vec_dup_from_fpu x (vector_size ty) 0)) (rule (lower (has_type ty (splat (f32const (u64_from_ieee32 n))))) (splat_const n (vector_size ty)))
diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs index 57e187142ca6..707243d2a3b2 100644 --- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs +++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs @@ -742,4 +742,47 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> { ); } } + + fn vec_extract_imm4_from_immediate(&mut self, imm: Immediate) -> Option<u8> { + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + + if bytes.windows(2).all(|a| a[0] + 1 == a[1]) && bytes[0] < 16 { + Some(bytes[0]) + } else { + None + } + } + + fn shuffle_dup8_from_imm(&mut self, imm: Immediate) -> Option<u8> { + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + if bytes.iter().all(|b| *b == bytes[0]) && bytes[0] < 16 { + Some(bytes[0]) + } else { + None + } + } + fn shuffle_dup16_from_imm(&mut self, imm: Immediate) -> Option<u8> { + let (a, b, c, d, e, f, g, h) = self.shuffle16_from_imm(imm)?; + if a == b && b == c && c == d && d == e && e == f && f == g && g == h && a < 8 { + Some(a) + } else { + None + } + } + fn shuffle_dup32_from_imm(&mut self, imm: Immediate) -> Option<u8> { + let (a, b, c, d) = self.shuffle32_from_imm(imm)?; + if a == b && b == c && c == d && a < 4 { + Some(a) + } else { + None + } + } + fn shuffle_dup64_from_imm(&mut self, imm: Immediate) -> Option<u8> { + let (a, b) = self.shuffle64_from_imm(imm)?; + if a == b && a < 2 { + Some(a) + } else { + None + } + } }
diff --git a/cranelift/codegen/src/machinst/isle.rs b/cranelift/codegen/src/machinst/isle.rs index 3eb0db7ea5df..9b6adcab9cb2 100644 --- a/cranelift/codegen/src/machinst/isle.rs +++ b/cranelift/codegen/src/machinst/isle.rs @@ -586,6 +586,17 @@ macro_rules! isle_lower_prelude_methods { self.lower_ctx.gen_return(rets); } + /// Same as `shuffle32_from_imm`, but for 64-bit lane shuffles. + fn shuffle64_from_imm(&mut self, imm: Immediate) -> Option<(u8, u8)> { + use crate::machinst::isle::shuffle_imm_as_le_lane_idx; + + let bytes = self.lower_ctx.get_immediate_data(imm).as_slice(); + Some(( + shuffle_imm_as_le_lane_idx(8, &bytes[0..8])?, + shuffle_imm_as_le_lane_idx(8, &bytes[8..16])?, + )) + } + /// Attempts to interpret the shuffle immediate `imm` as a shuffle of /// 32-bit lanes, returning four integers, each of which is less than 8, /// which represents a permutation of 32-bit lanes as specified by
diff --git a/cranelift/codegen/src/prelude_lower.isle b/cranelift/codegen/src/prelude_lower.isle index a7e59d5908a8..a0498fae4d24 100644 --- a/cranelift/codegen/src/prelude_lower.isle +++ b/cranelift/codegen/src/prelude_lower.isle @@ -597,6 +597,8 @@ ;; returned will be in the range of 0 to (256/N)-1, inclusive, and index the ;; N-bit chunks of two concatenated 128-bit vectors starting from the ;; least-significant bits.
+(decl shuffle64_from_imm (u8 u8) Immediate) +(extern extractor shuffle64_from_imm shuffle64_from_imm) (decl shuffle32_from_imm (u8 u8 u8 u8) Immediate) (extern extractor shuffle32_from_imm shuffle32_from_imm) (decl shuffle16_from_imm (u8 u8 u8 u8 u8 u8 u8 u8) Immediate) diff --git a/cranelift/filetests/filetests/isa/aarch64/shuffle.clif b/cranelift/filetests/filetests/isa/aarch64/shuffle.clif new file mode 100644 index 000000000000..c52959b8862f --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/shuffle.clif @@ -0,0 +1,747 @@ +test compile precise-output +set unwind_info=false +target aarch64 + +function %shuffle_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [3 0 31 26 4 6 12 11 23 13 24 4 2 15 17 5] + return v2 +} + +; VCode: +; block0: +; mov v30.16b, v0.16b +; mov v31.16b, v1.16b +; ldr q3, pc+8 ; b 20 ; data.f128 0x05110f0204180d170b0c06041a1f0003 +; tbl v0.16b, { v30.16b, v31.16b }, v3.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; mov v30.16b, v0.16b +; mov v31.16b, v1.16b +; ldr q3, #0x10 +; b #0x20 +; adc w3, w0, wzr +; add w4, w16, w12, lsl #1 +; orr z23.b, p3/m, z23.b, z8.b +; mov z2.b, p1/z, #0x78 +; tbl v0.16b, {v30.16b, v31.16b}, v3.16b +; ret + +function %aarch64_uzp1_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30] + return v2 +} + +; VCode: +; block0: +; uzp1 v0.16b, v0.16b, v1.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp1 v0.16b, v0.16b, v1.16b +; ret + +function %aarch64_uzp2_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31] + return v2 +} + +; VCode: +; block0: +; uzp2 v0.16b, v0.16b, v1.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp2 v0.16b, v0.16b, v1.16b +; ret + +function %aarch64_uzp1_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; block0: +; uzp1 v0.8h, v0.8h, v1.8h +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp1 v0.8h, v0.8h, v1.8h +; ret + +function %aarch64_uzp2_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; block0: +; uzp2 v0.8h, v0.8h, v1.8h +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp2 v0.8h, v0.8h, v1.8h +; ret + +function %aarch64_uzp1_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; block0: +; uzp1 v0.4s, v0.4s, v1.4s +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp1 v0.4s, v0.4s, v1.4s +; ret + +function %aarch64_uzp2_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; block0: +; uzp2 v0.4s, v0.4s, v1.4s +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp2 v0.4s, v0.4s, v1.4s +; ret + +function %aarch64_uzp1_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: 
i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; block0: +; uzp1 v0.2d, v0.2d, v1.2d +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp1 v0.2d, v0.2d, v1.2d +; ret + +function %aarch64_uzp2_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; block0: +; uzp2 v0.2d, v0.2d, v1.2d +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp2 v0.2d, v0.2d, v1.2d +; ret + +function %punpcklbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23] + return v2 +} + +; VCode: +; block0: +; zip1 v0.16b, v0.16b, v1.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; zip1 v0.16b, v0.16b, v1.16b +; ret + +function %punpckhbw(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [8 24 9 25 10 26 11 27 12 28 13 29 14 30 15 31] + return v2 +} + +; VCode: +; block0: +; zip2 v0.16b, v0.16b, v1.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; zip2 v0.16b, v0.16b, v1.16b +; ret + +function %punpcklwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 16 17 2 3 18 19 4 5 20 21 6 7 22 23] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; block0: +; zip1 v0.8h, v0.8h, v1.8h +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; zip1 v0.8h, v0.8h, v1.8h +; ret + +function %punpckhwd(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 24 25 10 11 26 27 12 13 28 29 14 15 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; block0: +; zip2 v0.8h, v0.8h, v1.8h +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; zip2 v0.8h, v0.8h, v1.8h +; ret + +function %punpckldq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 4 5 6 7 20 21 22 23] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; block0: +; zip1 v0.4s, v0.4s, v1.4s +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; zip1 v0.4s, v0.4s, v1.4s +; ret + +function %punpckhdq(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 24 25 26 27 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; block0: +; zip2 v0.4s, v0.4s, v1.4s +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; zip2 v0.4s, v0.4s, v1.4s +; ret + +function %punpcklqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; block0: +; uzp1 v0.2d, v0.2d, v1.2d +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp1 v0.2d, v0.2d, v1.2d +; ret + +function %punpckhqdq(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; 
VCode: +; block0: +; uzp2 v0.2d, v0.2d, v1.2d +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp2 v0.2d, v0.2d, v1.2d +; ret + +function %aarch64_trn1_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 16 2 18 4 20 6 22 8 24 10 26 12 28 14 30] + return v2 +} + +; VCode: +; block0: +; trn1 v0.16b, v0.16b, v1.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; trn1 v0.16b, v0.16b, v1.16b +; ret + +function %aarch64_trn2_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [1 17 3 19 5 21 7 23 9 25 11 27 13 29 15 31] + return v2 +} + +; VCode: +; block0: +; trn2 v0.16b, v0.16b, v1.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; trn2 v0.16b, v0.16b, v1.16b +; ret + +function %aarch64_trn1_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 16 17 4 5 20 21 8 9 24 25 12 13 28 29] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; block0: +; trn1 v0.8h, v0.8h, v1.8h +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; trn1 v0.8h, v0.8h, v1.8h +; ret + +function %aarch64_trn2_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [2 3 18 19 6 7 22 23 10 11 26 27 14 15 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; block0: +; trn2 v0.8h, v0.8h, v1.8h +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; trn2 v0.8h, v0.8h, v1.8h +; ret + +function %aarch64_trn1_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 8 9 10 11 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; block0: +; trn1 v0.4s, v0.4s, v1.4s +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; trn1 v0.4s, v0.4s, v1.4s +; ret + +function %aarch64_trn2_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [4 5 6 7 20 21 22 23 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; block0: +; trn2 v0.4s, v0.4s, v1.4s +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; trn2 v0.4s, v0.4s, v1.4s +; ret + +function %aarch64_trn1_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; block0: +; uzp1 v0.2d, v0.2d, v1.2d +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp1 v0.2d, v0.2d, v1.2d +; ret + +function %aarch64_trn2_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; block0: +; uzp2 v0.2d, v0.2d, v1.2d +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; uzp2 v0.2d, v0.2d, v1.2d +; ret + +function %aarch64_ext_0(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + return v2 +} + +; VCode: +; block0: +; ext v0.16b, v0.16b, v1.16b, #0 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ext v0.16b, v0.16b, v1.16b, #0 +; ret + +function %aarch64_ext_1(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: 
i8x16): + v2 = shuffle v0, v1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + return v2 +} + +; VCode: +; block0: +; ext v0.16b, v0.16b, v1.16b, #1 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ext v0.16b, v0.16b, v1.16b, #1 +; ret + +function %aarch64_ext_5(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20] + return v2 +} + +; VCode: +; block0: +; ext v0.16b, v0.16b, v1.16b, #5 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ext v0.16b, v0.16b, v1.16b, #5 +; ret + +function %aarch64_ext_11(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] + return v2 +} + +; VCode: +; block0: +; ext v0.16b, v0.16b, v1.16b, #11 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; ext v0.16b, v0.16b, v1.16b, #0xb +; ret + +function %aarch64_ext_16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31] + return v2 +} + +; VCode: +; block0: +; mov v30.16b, v0.16b +; mov v31.16b, v1.16b +; ldr q3, pc+8 ; b 20 ; data.f128 0x1f1e1d1c1b1a19181716151413121110 +; tbl v0.16b, { v30.16b, v31.16b }, v3.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; mov v30.16b, v0.16b +; mov v31.16b, v1.16b +; ldr q3, #0x10 +; b #0x20 +; sbfiz w16, w8, #0xe, #5 +; b #0xfffffffffc585464 +; madd w24, w8, w26, w6 +; fmadd s28, s8, s30, s7 +; tbl v0.16b, {v30.16b, v31.16b}, v3.16b +; ret + +function %aarch64_dup_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5] + return v2 +} + +; VCode: +; block0: +; dup v0.16b, v0.b[5] +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; dup v0.16b, v0.b[5] +; ret + +function %aarch64_dup_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 12 13 12 13 12 13 12 13 12 13 12 13 12 13] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; block0: +; dup v0.8h, v0.h[6] +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; dup v0.8h, v0.h[6] +; ret + +function %aarch64_dup_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 14 15 12 13 14 15 12 13 14 15 12 13 14 15] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; block0: +; dup v0.4s, v0.s[3] +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; dup v0.4s, v0.s[3] +; ret + +function %aarch64_dup_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; block0: +; dup v0.2d, v0.d[0] +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; dup v0.2d, v0.d[0] +; ret + +function %aarch64_rev16(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [1 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14] + v5 = bitcast.i16x8 little v4 + return v5 +} + +; VCode: +; block0: +; rev16 v0.16b, v0.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; rev16 v0.16b, v0.16b +; ret + +function %aarch64_rev32_bytes(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] + 
v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; block0: +; rev32 v0.16b, v0.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; rev32 v0.16b, v0.16b +; ret + +function %aarch64_rev32_words(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13] + v5 = bitcast.i32x4 little v4 + return v5 +} + +; VCode: +; block0: +; rev32 v0.8h, v0.8h +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; rev32 v0.8h, v0.8h +; ret + +function %aarch64_rev64_bytes(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; block0: +; rev64 v0.16b, v0.16b +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; rev64 v0.16b, v0.16b +; ret + +function %aarch64_rev64_words(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 14 15 12 13 10 11 8 9] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; block0: +; rev64 v0.8h, v0.8h +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; rev64 v0.8h, v0.8h +; ret + +function %aarch64_rev64_doublewords(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [4 5 6 7 0 1 2 3 12 13 14 15 8 9 10 11] + v5 = bitcast.i64x2 little v4 + return v5 +} + +; VCode: +; block0: +; rev64 v0.4s, v0.4s +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; rev64 v0.4s, v0.4s +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-shuffle.clif b/cranelift/filetests/filetests/runtests/simd-shuffle.clif index 60b515628d36..6056fb3dd641 100644 --- a/cranelift/filetests/filetests/runtests/simd-shuffle.clif +++ b/cranelift/filetests/filetests/runtests/simd-shuffle.clif @@ -258,3 +258,295 @@ block0(v0: i8x16, v1: i8x16): return v2 } ; run: %shuffle_all_zeros([5 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5] + +function %aarch64_uzp1_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30] + return v2 +} +; run: %aarch64_uzp1_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31] + +function %aarch64_uzp2_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31] + return v2 +} +; run: %aarch64_uzp2_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32] + +function %aarch64_uzp1_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %aarch64_uzp1_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 3 5 7 9 11 13 15] + +function %aarch64_uzp2_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %aarch64_uzp2_i16x8([1 
2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [2 4 6 8 10 12 14 16] + +function %aarch64_uzp1_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %aarch64_uzp1_i32x4([1 2 3 4], [5 6 7 8]) == [1 3 5 7] + +function %aarch64_uzp2_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %aarch64_uzp2_i32x4([1 2 3 4], [5 6 7 8]) == [2 4 6 8] + +function %aarch64_uzp1_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %aarch64_uzp1_i64x2([1 2], [3 4]) == [1 3] + +function %aarch64_uzp2_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %aarch64_uzp2_i64x2([1 2], [3 4]) == [2 4] + +function %aarch64_trn1_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 16 2 18 4 20 6 22 8 24 10 26 12 28 14 30] + return v2 +} +; run: %aarch64_trn1_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 17 3 19 5 21 7 23 9 25 11 27 13 29 15 31] + +function %aarch64_trn2_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [1 17 3 19 5 21 7 23 9 25 11 27 13 29 15 31] + return v2 +} +; run: %aarch64_trn2_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [2 18 4 20 6 22 8 24 10 26 12 28 14 30 16 32] + +function %aarch64_trn1_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 16 17 4 5 20 21 8 9 24 25 12 13 28 29] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %aarch64_trn1_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [1 9 3 11 5 13 7 15] + +function %aarch64_trn2_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [2 3 18 19 6 7 22 23 10 11 26 27 14 15 30 31] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %aarch64_trn2_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [2 10 4 12 6 14 8 16] + +function %aarch64_trn1_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 16 17 18 19 8 9 10 11 24 25 26 27] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %aarch64_trn1_i32x4([1 2 3 4], [5 6 7 8]) == [1 5 3 7] + +function %aarch64_trn2_i32x4(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [4 5 6 7 20 21 22 23 12 13 14 15 28 29 30 31] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %aarch64_trn2_i32x4([1 2 3 4], [5 6 7 8]) == [2 6 4 8] + +function %aarch64_trn1_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16
little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %aarch64_trn1_i64x2([1 2], [3 4]) == [1 3] + +function %aarch64_trn2_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %aarch64_trn2_i64x2([1 2], [3 4]) == [2 4] + +function %aarch64_ext_0(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + return v2 +} +; run: %aarch64_ext_0([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + +function %aarch64_ext_1(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + return v2 +} +; run: %aarch64_ext_1([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17] + +function %aarch64_ext_5(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20] + return v2 +} +; run: %aarch64_ext_5([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] + +function %aarch64_ext_11(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] + return v2 +} +; run: %aarch64_ext_11([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27] + +function %aarch64_ext_16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31] + return v2 +} +; run: %aarch64_ext_16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32] + +function %aarch64_dup_i8x16(i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16): + v2 = shuffle v0, v1, [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5] + return v2 +} +; run: %aarch64_dup_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6] + +function %aarch64_dup_i16x8(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [12 13 12 13 12 13 12 13 12 13 12 13 12 13 12 13] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %aarch64_dup_i16x8([1 2 3 4 5 6 7 8], [9 10 11 12 13 14 15 16]) == [7 7 7 7 7 7 7 7] + +function %aarch64_dup_i32x4_1(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [8 9 10 11 8 9 10 11 8 9 10 11 8 9 10 11] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %aarch64_dup_i32x4_1([1 2 3 4], [5 6 7 8]) == [3 3 3 3] + +function %aarch64_dup_i32x4_2(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [16 17 18 19 16 17 18 19 16 17 18 19 16 17 18 19] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %aarch64_dup_i32x4_2([1 2 3 4], [5 6 7 8]) == [5 5 5 5] + +function %aarch64_dup_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: 
i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %aarch64_dup_i64x2([1 2], [5 6]) == [1 1] + +function %aarch64_rev16(i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [1 0 3 2 5 4 7 6 9 8 11 10 13 12 15 14] + v5 = bitcast.i16x8 little v4 + return v5 +} +; run: %aarch64_rev16([0x1 0x2 0x3 0x4 0x100 0x200 0x300 0x400], [0 0 0 0 0 0 0 0]) == [0x100 0x200 0x300 0x400 0x1 0x2 0x3 0x4] + +function %aarch64_rev32_bytes(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %aarch64_rev32_bytes([1 2 3 4], [0 0 0 0]) == [0x1000000 0x2000000 0x3000000 0x4000000] +; run: %aarch64_rev32_bytes([0xff000000 0x00ff0000 0x0000ff00 0x000000ff], [0 0 0 0]) == [0xff 0xff00 0xff0000 0xff000000] + +function %aarch64_rev32_words(i32x4, i32x4) -> i32x4 { +block0(v0: i32x4, v1: i32x4): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13] + v5 = bitcast.i32x4 little v4 + return v5 +} +; run: %aarch64_rev32_words([1 2 3 4], [0 0 0 0]) == [0x10000 0x20000 0x30000 0x40000] +; run: %aarch64_rev32_words([0xff000000 0x00ff0000 0x0000ff00 0x000000ff], [0 0 0 0]) == [0xff00 0xff 0xff000000 0xff0000] + +function %aarch64_rev64_bytes(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %aarch64_rev64_bytes([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0807060504030201 0x0102030405060708] + +function %aarch64_rev64_words(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [6 7 4 5 2 3 0 1 14 15 12 13 10 11 8 9] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %aarch64_rev64_words([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0708050603040102 0x0201040306050807] + +function %aarch64_rev64_doublewords(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = bitcast.i8x16 little v0 + v3 = bitcast.i8x16 little v1 + v4 = shuffle v2, v3, [4 5 6 7 0 1 2 3 12 13 14 15 8 9 10 11] + v5 = bitcast.i64x2 little v4 + return v5 +} +; run: %aarch64_rev64_doublewords([0x0102030405060708 0x0807060504030201], [0 0]) == [0x0506070801020304 0x0403020108070605]
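The `lane` value threaded through `MInst.VecDupFromFpu` above lands in the `imm5` field of the AArch64 `DUP (element)` encoding, where the position of the lowest set bit selects the element size and the bits above it hold the lane index. The following standalone Rust sketch (illustrative only; `enc_dup_element` is a made-up name, the real logic lives in the `Inst::VecDupFromFpu` arm in emit.rs) mirrors that imm5 computation and checks it against the two emit-test vectors above:

// DUP (element): 0 | Q | 0_0111_0000 | imm5 | 0b000001 | Rn | Rd
fn enc_dup_element(q: u32, lane_bytes: u32, lane: u32, rn: u32, rd: u32) -> u32 {
    // imm5 encodes the element size (lowest set bit) and the lane index
    // (bits above it), matching the match arms added to emit.rs.
    let imm5 = match lane_bytes {
        1 => { assert!(lane < 16); 0b00001 | (lane << 1) }
        2 => { assert!(lane < 8);  0b00010 | (lane << 2) }
        4 => { assert!(lane < 4);  0b00100 | (lane << 3) }
        8 => { assert!(lane < 2);  0b01000 | (lane << 4) }
        _ => unreachable!(),
    };
    (q << 30) | (0b01110000 << 21) | (imm5 << 16) | (0b000001 << 10) | (rn << 5) | rd
}

fn main() {
    // "dup v14.4s, v19.s[0]" is "6E06044E" in the emit tests, i.e. 0x4E04066E.
    assert_eq!(enc_dup_element(1, 4, 0, 19, 14), 0x4E04066E);
    // "dup v18.2d, v10.d[0]" is "5205084E", i.e. 0x4E080552.
    assert_eq!(enc_dup_element(1, 8, 0, 10, 18), 0x4E080552);
    println!("dup encodings match the emit tests");
}

With the lane index available, the `shuffle_dup*_from_imm` rules can lower broadcast-style shuffle immediates to a single `dup` instead of falling back to the two-register `tbl` sequence.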