Merge pull request #2985 from afonso360/aarch64-i128-load-store

aarch64: Implement I128 Loads and Stores

cfallin committed Jun 17, 2021
2 parents fb07ff5 + c827646 · commit de1edd4
Showing 5 changed files with 622 additions and 94 deletions.
139 changes: 107 additions & 32 deletions cranelift/codegen/src/isa/aarch64/lower.rs
@@ -692,6 +692,63 @@ fn collect_address_addends<C: LowerCtx<I = Inst>>(
(result64, result32, offset)
}

/// Lower the address of a pair load or store.
pub(crate) fn lower_pair_address<C: LowerCtx<I = Inst>>(
ctx: &mut C,
roots: &[InsnInput],
offset: i32,
) -> PairAMode {
// Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
// extends and addition ops. We update these as we consume address
// components, so they represent the remaining addends not yet handled.
let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
let offset = args_offset + (offset as i64);

trace!(
"lower_pair_address: addends64 {:?}, addends32 {:?}, offset {}",
addends64,
addends32,
offset
);

// Pair loads/stores only support a reg + imm addressing format, so that is all we need to handle.

let base_reg = if let Some(reg64) = addends64.pop() {
reg64
} else if let Some((reg32, extendop)) = addends32.pop() {
let tmp = ctx.alloc_tmp(I64).only_reg().unwrap();
let signed = match extendop {
ExtendOp::SXTW => true,
ExtendOp::UXTW => false,
_ => unreachable!(),
};
ctx.emit(Inst::Extend {
rd: tmp,
rn: reg32,
signed,
from_bits: 32,
to_bits: 64,
});
tmp.to_reg()
} else {
zero_reg()
};

let addr = ctx.alloc_tmp(I64).only_reg().unwrap();
ctx.emit(Inst::gen_move(addr, base_reg, I64));

// We have the base register; if there are any remaining addends, add them in.
lower_add_addends(ctx, addr, addends64, addends32);

// Figure out what offset we should emit
let imm7 = SImm7Scaled::maybe_from_i64(offset, I64).unwrap_or_else(|| {
lower_add_immediate(ctx, addr, addr.to_reg(), offset);
SImm7Scaled::maybe_from_i64(0, I64).unwrap()
});

PairAMode::SignedOffset(addr.to_reg(), imm7)
}
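
// A minimal sketch, for intuition only, of the range check that
// `SImm7Scaled::maybe_from_i64` performs above for an I64 access, assuming
// the standard AArch64 LDP/STP encoding: a signed 7-bit immediate scaled by
// the access size (8 bytes here). The name is hypothetical, not Cranelift API.
fn fits_simm7_scaled_i64(offset: i64) -> bool {
    // Must be 8-byte aligned and land in [-64 * 8, 63 * 8] = [-512, 504];
    // anything else takes the fallback that adds the offset into the base.
    offset % 8 == 0 && (-512..=504).contains(&offset)
}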

/// Lower the address of a load or store.
pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
ctx: &mut C,
@@ -792,36 +849,23 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
// If there is any offset, load that first into `addr`, and add the `reg`
// that we kicked out of the `AMode`; otherwise, start with that reg.
if offset != 0 {
// If we can fit offset or -offset in an imm12, use an add-imm
// to combine the reg and offset. Otherwise, load value first then add.
if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) {
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Add64,
rd: addr,
rn: reg,
imm12,
});
} else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) {
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Sub64,
rd: addr,
rn: reg,
imm12,
});
} else {
lower_constant_u64(ctx, addr, offset as u64);
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Add64,
rd: addr,
rn: addr.to_reg(),
rm: reg,
});
}
lower_add_immediate(ctx, addr, reg, offset)
} else {
ctx.emit(Inst::gen_move(addr, reg, I64));
}

// Now handle reg64 and reg32-extended components.
lower_add_addends(ctx, addr, addends64, addends32);

memarg
}

fn lower_add_addends<C: LowerCtx<I = Inst>>(
ctx: &mut C,
rd: Writable<Reg>,
addends64: AddressAddend64List,
addends32: AddressAddend32List,
) {
for reg in addends64 {
// If the register is the stack reg, we must move it to another reg
// before adding it.
@@ -834,23 +878,51 @@ pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
};
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Add64,
rd: addr,
rn: addr.to_reg(),
rd,
rn: rd.to_reg(),
rm: reg,
});
}
for (reg, extendop) in addends32 {
assert!(reg != stack_reg());
ctx.emit(Inst::AluRRRExtend {
alu_op: ALUOp::Add64,
rd: addr,
rn: addr.to_reg(),
rd,
rn: rd.to_reg(),
rm: reg,
extendop,
});
}
}

memarg
/// Adds a signed immediate to `src`, placing the result in `dst`, using the best instruction pattern for the immediate.
// TODO: This function is duplicated in ctx.gen_add_imm
fn lower_add_immediate<C: LowerCtx<I = Inst>>(ctx: &mut C, dst: Writable<Reg>, src: Reg, imm: i64) {
// If we can fit imm or -imm in an imm12, use an add/sub-imm.
// Otherwise, lower the constant into a register first, then add.
if let Some(imm12) = Imm12::maybe_from_u64(imm as u64) {
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Add64,
rd: dst,
rn: src,
imm12,
});
} else if let Some(imm12) = Imm12::maybe_from_u64(imm.wrapping_neg() as u64) {
ctx.emit(Inst::AluRRImm12 {
alu_op: ALUOp::Sub64,
rd: dst,
rn: src,
imm12,
});
} else {
lower_constant_u64(ctx, dst, imm as u64);
ctx.emit(Inst::AluRRR {
alu_op: ALUOp::Add64,
rd: dst,
rn: dst.to_reg(),
rm: src,
});
}
}
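
// For intuition: the three arms above mirror the AArch64 arithmetic immediate
// encoding, where an imm12 is a 12-bit unsigned value optionally shifted left
// by 12 bits. A hedged stand-alone model of the selection (hypothetical names,
// not the Cranelift API):
enum AddImmPattern {
    AddImm12(u64),           // imm encodes directly:  add rd, rn, #imm
    SubImm12(u64),           // -imm encodes instead:  sub rd, rn, #(-imm)
    MaterializeThenAdd(i64), // neither fits: load the constant, then add regs
}

fn choose_add_imm_pattern(imm: i64) -> AddImmPattern {
    // Mirrors what `Imm12::maybe_from_u64` is assumed to accept.
    let encodable = |v: u64| v < (1 << 12) || (v & 0xfff == 0 && v < (1 << 24));
    if encodable(imm as u64) {
        AddImmPattern::AddImm12(imm as u64)
    } else if encodable(imm.wrapping_neg() as u64) {
        AddImmPattern::SubImm12(imm.wrapping_neg() as u64)
    } else {
        AddImmPattern::MaterializeThenAdd(imm)
    }
}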

pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
@@ -1248,7 +1320,10 @@ fn load_op_to_ty(op: Opcode) -> Option<Type> {

/// Helper to lower a load instruction; this is used in several places, because
/// a load can sometimes be merged into another operation.
pub(crate) fn lower_load<C: LowerCtx<I = Inst>, F: FnMut(&mut C, Writable<Reg>, Type, AMode)>(
pub(crate) fn lower_load<
C: LowerCtx<I = Inst>,
F: FnMut(&mut C, ValueRegs<Writable<Reg>>, Type, AMode),
>(
ctx: &mut C,
ir_inst: IRInst,
inputs: &[InsnInput],
@@ -1261,7 +1336,7 @@ pub(crate) fn lower_load<C: LowerCtx<I = Inst>, F: FnMut(&mut C, Writable<Reg>,

let off = ctx.data(ir_inst).load_store_offset().unwrap();
let mem = lower_address(ctx, elem_ty, &inputs[..], off);
let rd = get_output_reg(ctx, output).only_reg().unwrap();
let rd = get_output_reg(ctx, output);

f(ctx, rd, elem_ty, mem);
}
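
// The signature change above (Writable<Reg> -> ValueRegs<Writable<Reg>> in the
// callback) is what lets one load produce a multi-register value such as I128.
// A hedged mini-model of that container, for intuition only; the real
// ValueRegs type lives in the shared machinst code and differs in detail.
#[derive(Clone, Copy)]
enum ValueRegsModel<R> {
    One(R),
    Two(R, R),
}

impl<R: Copy> ValueRegsModel<R> {
    // Single-register callers unwrap with only_reg(), as the diff does;
    // the I128 path uses both registers instead.
    fn only_reg(self) -> Option<R> {
        match self {
            ValueRegsModel::One(r) => Some(r),
            ValueRegsModel::Two(..) => None,
        }
    }
}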
150 changes: 88 additions & 62 deletions cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1303,56 +1303,71 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
.memflags(insn)
.expect("Load instruction should have memflags");

lower_load(
ctx,
insn,
&inputs[..],
outputs[0],
|ctx, rd, elem_ty, mem| {
let is_float = ty_has_float_or_vec_representation(elem_ty);
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
(1, _, _) => Inst::ULoad8 { rd, mem, flags },
(8, false, _) => Inst::ULoad8 { rd, mem, flags },
(8, true, _) => Inst::SLoad8 { rd, mem, flags },
(16, false, _) => Inst::ULoad16 { rd, mem, flags },
(16, true, _) => Inst::SLoad16 { rd, mem, flags },
(32, false, false) => Inst::ULoad32 { rd, mem, flags },
(32, true, false) => Inst::SLoad32 { rd, mem, flags },
(32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
(64, _, false) => Inst::ULoad64 { rd, mem, flags },
// Note that we treat some of the vector loads as scalar floating-point loads,
// which is correct in a little endian environment.
(64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
(128, _, _) => Inst::FpuLoad128 { rd, mem, flags },
_ => panic!("Unsupported size in load"),
});

let vec_extend = match op {
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
_ => None,
};

if let Some(t) = vec_extend {
ctx.emit(Inst::VecExtend {
t,
rd,
rn: rd.to_reg(),
high_half: false,
let out_ty = ctx.output_ty(insn, 0);
if out_ty == I128 {
let off = ctx.data(insn).load_store_offset().unwrap();
let mem = lower_pair_address(ctx, &inputs[..], off);
let dst = get_output_reg(ctx, outputs[0]);
ctx.emit(Inst::LoadP64 {
rt: dst.regs()[0],
rt2: dst.regs()[1],
mem,
flags,
});
} else {
lower_load(
ctx,
insn,
&inputs[..],
outputs[0],
|ctx, dst, elem_ty, mem| {
let rd = dst.only_reg().unwrap();
let is_float = ty_has_float_or_vec_representation(elem_ty);
ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) {
(1, _, _) => Inst::ULoad8 { rd, mem, flags },
(8, false, _) => Inst::ULoad8 { rd, mem, flags },
(8, true, _) => Inst::SLoad8 { rd, mem, flags },
(16, false, _) => Inst::ULoad16 { rd, mem, flags },
(16, true, _) => Inst::SLoad16 { rd, mem, flags },
(32, false, false) => Inst::ULoad32 { rd, mem, flags },
(32, true, false) => Inst::SLoad32 { rd, mem, flags },
(32, _, true) => Inst::FpuLoad32 { rd, mem, flags },
(64, _, false) => Inst::ULoad64 { rd, mem, flags },
// Note that we treat some of the vector loads as scalar floating-point loads,
// which is correct in a little endian environment.
(64, _, true) => Inst::FpuLoad64 { rd, mem, flags },
(128, _, true) => Inst::FpuLoad128 { rd, mem, flags },
_ => panic!("Unsupported size in load"),
});
}
},
);

let vec_extend = match op {
Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8),
Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8),
Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8),
Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8),
Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16),
Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16),
Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16),
Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16),
Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32),
Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32),
Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32),
Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32),
_ => None,
};

if let Some(t) = vec_extend {
let rd = dst.only_reg().unwrap();
ctx.emit(Inst::VecExtend {
t,
rd,
rn: rd.to_reg(),
high_half: false,
});
}
},
);
}
}
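
// On this little-endian target an I128 value lives in a register pair, with
// regs()[0] holding the low 64 bits, so LoadP64 above fills rt from the lower
// address and rt2 from the higher one. A minimal sketch of that split,
// assuming little-endian layout (plain Rust, not the Cranelift API):
fn split_i128(v: i128) -> (u64, u64) {
    // (low half, high half), matching (rt, rt2) for an ldp at ascending addresses.
    (v as u64, ((v as u128) >> 64) as u64)
}

fn join_i128(lo: u64, hi: u64) -> i128 {
    (((hi as u128) << 64) | lo as u128) as i128
}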

Opcode::Store
@@ -1376,19 +1391,30 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
.memflags(insn)
.expect("Store instruction should have memflags");

let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);

ctx.emit(match (ty_bits(elem_ty), is_float) {
(1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
(16, _) => Inst::Store16 { rd, mem, flags },
(32, false) => Inst::Store32 { rd, mem, flags },
(32, true) => Inst::FpuStore32 { rd, mem, flags },
(64, false) => Inst::Store64 { rd, mem, flags },
(64, true) => Inst::FpuStore64 { rd, mem, flags },
(128, _) => Inst::FpuStore128 { rd, mem, flags },
_ => panic!("Unsupported size in store"),
});
let dst = put_input_in_regs(ctx, inputs[0]);

if elem_ty == I128 {
let mem = lower_pair_address(ctx, &inputs[1..], off);
ctx.emit(Inst::StoreP64 {
rt: dst.regs()[0],
rt2: dst.regs()[1],
mem,
flags,
});
} else {
let rd = dst.only_reg().unwrap();
let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
ctx.emit(match (ty_bits(elem_ty), is_float) {
(1, _) | (8, _) => Inst::Store8 { rd, mem, flags },
(16, _) => Inst::Store16 { rd, mem, flags },
(32, false) => Inst::Store32 { rd, mem, flags },
(32, true) => Inst::FpuStore32 { rd, mem, flags },
(64, false) => Inst::Store64 { rd, mem, flags },
(64, true) => Inst::FpuStore64 { rd, mem, flags },
(128, _) => Inst::FpuStore128 { rd, mem, flags },
_ => panic!("Unsupported size in store"),
});
}
}

Opcode::StackAddr => {