Skip to content

Commit

Permalink
8308966: Add intrinsic for float/double modulo for x86 AVX2 and AVX512
Browse files Browse the repository at this point in the history
Co-authored-by: Marius Cornea <marius.cornea@intel.com>
Reviewed-by: jbhateja, sviswanathan
  • Loading branch information
2 people authored and Sandhya Viswanathan committed Jun 12, 2023
1 parent 8e4e6b0 commit 5d5ae35
Show file tree
Hide file tree
Showing 12 changed files with 902 additions and 1 deletion.
69 changes: 69 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3555,6 +3555,14 @@ void Assembler::movsd(Address dst, XMMRegister src) {
emit_operand(src, dst, 0);
}

// Emit the VEX three-operand form of MOVSD (store-form opcode 0x11, F2 prefix):
// the low 64 bits of dst are taken from src2 and the upper 64 bits from src.
// Note the reversed argument order passed to vex_prefix_and_encode — 0x11 is
// the store form, so dst is encoded in the r/m slot and src2 in the reg slot.
//
// Cleanup: the original body carried an InstructionMark, but this is a pure
// register-register encoding with no memory operand, so there is no relocation
// bookkeeping to mark; the other reg-reg emitters in this file (vfnmadd213sd,
// vroundsd, evdivsd, ...) consistently omit it.
void Assembler::vmovsd(XMMRegister dst, XMMRegister src, XMMRegister src2) {
  assert(UseAVX > 0, "Requires some form of AVX");
  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  int encode = vex_prefix_and_encode(src2->encoding(), src->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
  emit_int16(0x11, (0xC0 | encode));
}

void Assembler::movss(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
Expand Down Expand Up @@ -6531,6 +6539,29 @@ void Assembler::vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2)
emit_int16((unsigned char)0xB9, (0xC0 | encode));
}

// EVEX-encoded VFNMADD213SD with a static (embedded) rounding mode:
// dst = -(src1 * dst) + src2, scalar double (opcode 0xAD, 66 0F 38, W1).
// rmode is passed in the InstructionAttr vector-length slot: with embedded
// rounding active, the EVEX L'L bits carry the rounding control instead of a
// vector length.
void Assembler::evfnmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2, EvexRoundPrefix rmode) { // Need to add rmode for rounding mode support
  assert(VM_Version::supports_evex(), "");
  InstructionAttr attributes(rmode, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  attributes.set_extended_context();   // NOTE(review): presumably sets EVEX.b to enable the embedded rounding control — confirm in evex_prefix
  attributes.set_is_evex_instruction();
  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  emit_int16((unsigned char)0xAD, (0xC0 | encode));
}

// VFNMADD213SD dst, src1, src2: dst = -(src1 * dst) + src2, scalar double.
// VEX encoding, opcode 0xAD (66 0F 38, W1). Default MXCSR rounding applies
// (contrast with evfnmadd213sd, which embeds a rounding mode).
void Assembler::vfnmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  assert(VM_Version::supports_fma(), "");
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  emit_int16((unsigned char)0xAD, (0xC0 | encode));
}

// VFNMADD231SD dst, src1, src2: dst = -(src1 * src2) + dst, scalar double.
// VEX encoding, opcode 0xBD (66 0F 38, W1).
void Assembler::vfnmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  assert(VM_Version::supports_fma(), "");
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  emit_int16((unsigned char)0xBD, (0xC0 | encode));
}

void Assembler::vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
assert(VM_Version::supports_fma(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
Expand Down Expand Up @@ -6892,6 +6923,22 @@ void Assembler::vroundpd(XMMRegister dst, Address src, int32_t rmode, int vecto
emit_int8((rmode));
}

// VROUNDSD dst, src, src2, imm8: round the low double of src2 per the imm8
// rounding-control bits; upper 64 bits of dst come from src.
// VEX encoding, opcode 0x0B (66 0F 3A).
void Assembler::vroundsd(XMMRegister dst, XMMRegister src, XMMRegister src2, int32_t rmode) {
  assert(VM_Version::supports_avx(), "");
  assert(rmode <= 0x0f, "rmode 0x%x", rmode);  // only the low 4 imm8 bits are meaningful here
  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
  emit_int24(0x0B, (0xC0 | encode), (rmode));
}

// VRNDSCALESD dst, src1, src2, imm8: EVEX scalar-double rounding, the AVX-512
// counterpart of vroundsd (same 0x0B opcode, 66 0F 3A map, W1).
// NOTE(review): imm8 bits [7:4] select a scale for VRNDSCALE; callers here are
// expected to pass plain rounding-control values — no assert guards the range.
void Assembler::vrndscalesd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int32_t rmode) {
  assert(VM_Version::supports_evex(), "requires EVEX support");
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  attributes.set_is_evex_instruction();
  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
  emit_int24(0x0B, (0xC0 | encode), (rmode));
}

void Assembler::vrndscalepd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len) {
assert(VM_Version::supports_evex(), "requires EVEX support");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
Expand Down Expand Up @@ -8857,6 +8904,19 @@ void Assembler::vextractf64x4(Address dst, XMMRegister src, uint8_t imm8) {
emit_int8(imm8 & 0x01);
}

// EXTRACTPS dst(gpr), src(xmm), imm8: copy the 32-bit dword selected by imm8
// from src into the general-purpose register dst (opcode 0x17, 66 0F 3A).
// dst is re-cast through as_XMMRegister only to reuse the SIMD encode path;
// the encoded operand is the GPR.
void Assembler::extractps(Register dst, XMMRegister src, uint8_t imm8) {
  assert(VM_Version::supports_sse4_1(), "");
  assert(imm8 <= 0x03, "imm8: %u", imm8);
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
  // imm8 selects the source dword:
  // 0x00 - extract from bits 31:0
  // 0x01 - extract from bits 63:32
  // 0x02 - extract from bits 95:64
  // 0x03 - extract from bits 127:96
  emit_int24(0x17, (0xC0 | encode), imm8 & 0x03);
}

// duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL
void Assembler::vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_avx2(), "");
Expand Down Expand Up @@ -9531,6 +9591,15 @@ void Assembler::evdivpd(XMMRegister dst, KRegister mask, XMMRegister nds, Addres
emit_operand(dst, src, 0);
}

// EVEX-encoded VDIVSD with a static (embedded) rounding mode:
// dst = nds / src, scalar double (opcode 0x5E, F2 0F, W1).
// As in evfnmadd213sd, rmode rides in the vector-length slot so the EVEX L'L
// bits carry the rounding control.
void Assembler::evdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode) {
  assert(VM_Version::supports_evex(), "");
  InstructionAttr attributes(rmode, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
  attributes.set_extended_context();   // NOTE(review): presumably sets EVEX.b to enable the embedded rounding control — confirm in evex_prefix
  attributes.set_is_evex_instruction();
  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
  emit_int16(0x5E, (0xC0 | encode));
}

void Assembler::evpabsb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), "");
InstructionAttr attributes(vector_len, /* vex_w */ false,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
Expand Down
20 changes: 20 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,13 @@ class Assembler : public AbstractAssembler {
EVEX_Z = 0x80
};

// EVEX embedded rounding-control values, consumed by the rounding-aware
// scalar emitters (evdivsd, evfnmadd213sd). Passed in the InstructionAttr
// vector-length position because, with embedded rounding active, the EVEX
// L'L bits encode the rounding mode rather than a vector length.
enum EvexRoundPrefix {
  EVEX_RNE = 0x0,   // round to nearest, ties to even
  EVEX_RD = 0x1,    // round down (toward -infinity)
  EVEX_RU = 0x2,    // round up (toward +infinity)
  EVEX_RZ = 0x3     // round toward zero (truncate)
};

enum VexSimdPrefix {
VEX_SIMD_NONE = 0x0,
VEX_SIMD_66 = 0x1,
Expand Down Expand Up @@ -886,6 +893,8 @@ class Assembler : public AbstractAssembler {
void movsd(Address dst, XMMRegister src);
void movlpd(XMMRegister dst, Address src);

void vmovsd(XMMRegister dst, XMMRegister src, XMMRegister src2);

// New cpus require use of movaps and movapd to avoid partial register stall
// when moving between registers.
void movaps(XMMRegister dst, XMMRegister src);
Expand Down Expand Up @@ -2242,9 +2251,13 @@ class Assembler : public AbstractAssembler {
void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void evdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode);
void vdivss(XMMRegister dst, XMMRegister nds, Address src);
void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vfnmadd213sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void evfnmadd213sd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode);
void vfnmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
Expand Down Expand Up @@ -2334,8 +2347,11 @@ class Assembler : public AbstractAssembler {
// Round Packed Double precision value.
void vroundpd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);
void vroundpd(XMMRegister dst, Address src, int32_t rmode, int vector_len);
void vrndscalesd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int32_t rmode);
void vrndscalepd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);
void vrndscalepd(XMMRegister dst, Address src, int32_t rmode, int vector_len);
void vroundsd(XMMRegister dst, XMMRegister src, XMMRegister src2, int32_t rmode);
void vroundsd(XMMRegister dst, XMMRegister src, Address src2, int32_t rmode);

// Bitwise Logical AND of Packed Floating-Point Values
void andpd(XMMRegister dst, XMMRegister src);
Expand Down Expand Up @@ -2719,6 +2735,8 @@ class Assembler : public AbstractAssembler {
void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);

void extractps(Register dst, XMMRegister src, uint8_t imm8);

// xmm/mem sourced byte/word/dword/qword replicate
void vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
void vpbroadcastb(XMMRegister dst, Address src, int vector_len);
Expand Down Expand Up @@ -2952,6 +2970,8 @@ class InstructionAttr {
_embedded_opmask_register_specifier = mask->encoding() & 0x7;
}

// Flags this instruction as using the EVEX extended context; set by the
// rounding-aware emitters before prefix emission.
// NOTE(review): _is_extended_context presumably feeds the EVEX.b bit that
// activates embedded rounding — confirm against the prefix-emission code.
void set_extended_context(void) { _is_extended_context = true; }

};

#endif // CPU_X86_ASSEMBLER_X86_HPP
2 changes: 1 addition & 1 deletion src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -968,7 +968,7 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) {
break;
case vmIntrinsics::_dpow:
if (StubRoutines::dpow() != nullptr) {
__ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
__ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args());
} else {
__ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), getThreadTemp(), result_reg, cc->args());
}
Expand Down
18 changes: 18 additions & 0 deletions src/hotspot/cpu/x86/sharedRuntime_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ void SharedRuntime::inline_check_hashcode_from_object_header(MacroAssembler* mas
#if defined(TARGET_COMPILER_gcc) && !defined(_WIN64)
JRT_LEAF(jfloat, SharedRuntime::frem(jfloat x, jfloat y))
jfloat retval;
const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
if (!is_LP64 || UseAVX < 1 || !UseFMA) {
asm ("\
1: \n\
fprem \n\
Expand All @@ -97,11 +99,21 @@ jne 1b \n\
:"=t"(retval)
:"0"(x), "u"(y)
:"cc", "ax");
} else {
assert(StubRoutines::fmod() != nullptr, "");
jdouble (*addr)(jdouble, jdouble) = (double (*)(double, double))StubRoutines::fmod();
jdouble dx = (jdouble) x;
jdouble dy = (jdouble) y;

retval = (jfloat) (*addr)(dx, dy);
}
return retval;
JRT_END

JRT_LEAF(jdouble, SharedRuntime::drem(jdouble x, jdouble y))
jdouble retval;
const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
if (!is_LP64 || UseAVX < 1 || !UseFMA) {
asm ("\
1: \n\
fprem \n\
Expand All @@ -112,6 +124,12 @@ jne 1b \n\
:"=t"(retval)
:"0"(x), "u"(y)
:"cc", "ax");
} else {
assert(StubRoutines::fmod() != nullptr, "");
jdouble (*addr)(jdouble, jdouble) = (double (*)(double, double))StubRoutines::fmod();

retval = (*addr)(x, y);
}
return retval;
JRT_END
#endif // TARGET_COMPILER_gcc && !_WIN64
4 changes: 4 additions & 0 deletions src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3937,6 +3937,10 @@ void StubGenerator::generate_initial_stubs() {
}

generate_libm_stubs();

if ((UseAVX >= 1) && (VM_Version::supports_avx512vlbwdq() || VM_Version::supports_fma())) {
StubRoutines::_fmod = generate_libmFmod(); // from stubGenerator_x86_64_fmod.cpp
}
}

void StubGenerator::generate_continuation_stubs() {
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,7 @@ class StubGenerator: public StubCodeGenerator {
address generate_libmPow();
address generate_libmLog();
address generate_libmLog10();
address generate_libmFmod();

// Shared constants
static address ZERO;
Expand Down
Loading

0 comments on commit 5d5ae35

Please sign in to comment.