diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 2902abfc6619e..e76a8a49e0bf4 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -3555,6 +3555,14 @@ void Assembler::movsd(Address dst, XMMRegister src) { emit_operand(src, dst, 0); } +void Assembler::vmovsd(XMMRegister dst, XMMRegister src, XMMRegister src2) { + assert(UseAVX > 0, "Requires some form of AVX"); + InstructionMark im(this); + InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(src2->encoding(), src->encoding(), dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); + emit_int16(0x11, (0xC0 | encode)); +} + void Assembler::movss(XMMRegister dst, XMMRegister src) { NOT_LP64(assert(VM_Version::supports_sse(), "")); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); @@ -6531,6 +6539,29 @@ void Assembler::vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) emit_int16((unsigned char)0xB9, (0xC0 | encode)); } +void Assembler::evfnmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2, EvexRoundPrefix rmode) { // Need to add rmode for rounding mode support + assert(VM_Version::supports_evex(), ""); + InstructionAttr attributes(rmode, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_extended_context(); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xAD, (0xC0 | encode)); +} + +void Assembler::vfnmadd213sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + assert(VM_Version::supports_fma(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xAD, (0xC0 | encode)); +} + +void Assembler::vfnmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) { + assert(VM_Version::supports_fma(), ""); + InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xBD, (0xC0 | encode)); +} + void Assembler::vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) { assert(VM_Version::supports_fma(), ""); InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); @@ -6892,6 +6923,22 @@ void Assembler::vroundpd(XMMRegister dst, Address src, int32_t rmode, int vecto emit_int8((rmode)); } +void Assembler::vroundsd(XMMRegister dst, XMMRegister src, XMMRegister src2, int32_t rmode) { + assert(VM_Version::supports_avx(), ""); + assert(rmode <= 0x0f, "rmode 0x%x", rmode); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x0B, (0xC0 | encode), (rmode)); +} + +void 
Assembler::vrndscalesd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int32_t rmode) { + assert(VM_Version::supports_evex(), "requires EVEX support"); + InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int24(0x0B, (0xC0 | encode), (rmode)); +} + void Assembler::vrndscalepd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len) { assert(VM_Version::supports_evex(), "requires EVEX support"); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); @@ -8857,6 +8904,19 @@ void Assembler::vextractf64x4(Address dst, XMMRegister src, uint8_t imm8) { emit_int8(imm8 & 0x01); } +void Assembler::extractps(Register dst, XMMRegister src, uint8_t imm8) { + assert(VM_Version::supports_sse4_1(), ""); + assert(imm8 <= 0x03, "imm8: %u", imm8); + InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + // imm8: + // 0x00 - extract from bits 31:0 + // 0x01 - extract from bits 63:32 + // 0x02 - extract from bits 95:64 + // 0x03 - extract from bits 127:96 + emit_int24(0x17, (0xC0 | encode), imm8 & 0x03); +} + // duplicate 1-byte integer data from src into programmed locations in dest : requires AVX512BW and AVX512VL void Assembler::vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) { assert(VM_Version::supports_avx2(), ""); @@ -9531,6 +9591,15 @@ void Assembler::evdivpd(XMMRegister dst, KRegister mask, XMMRegister nds, Addres emit_operand(dst, src, 0); } +void Assembler::evdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode) { + assert(VM_Version::supports_evex(), ""); + InstructionAttr attributes(rmode, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false); + attributes.set_extended_context(); + attributes.set_is_evex_instruction(); + int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); + emit_int16(0x5E, (0xC0 | encode)); +} + void Assembler::evpabsb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_avx512bw() && (vector_len == AVX_512bit || VM_Version::supports_avx512vl()), ""); InstructionAttr attributes(vector_len, /* vex_w */ false,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 5102e2c384925..60883f13f7a79 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -528,6 +528,13 @@ class Assembler : public AbstractAssembler { EVEX_Z = 0x80 }; + enum EvexRoundPrefix { + EVEX_RNE = 0x0, + EVEX_RD = 0x1, + EVEX_RU = 0x2, + EVEX_RZ = 0x3 + }; + enum VexSimdPrefix { VEX_SIMD_NONE = 0x0, VEX_SIMD_66 = 0x1, @@ -886,6 +893,8 @@ class Assembler : public AbstractAssembler { void movsd(Address dst, XMMRegister src); void movlpd(XMMRegister dst, Address src); + void vmovsd(XMMRegister dst, XMMRegister src, XMMRegister src2); + // New cpus require use of movaps and movapd to avoid partial register stall // when moving 
between registers. void movaps(XMMRegister dst, XMMRegister src); @@ -2242,9 +2251,13 @@ class Assembler : public AbstractAssembler { void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src); void vdivsd(XMMRegister dst, XMMRegister nds, Address src); void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src); + void evdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode); void vdivss(XMMRegister dst, XMMRegister nds, Address src); void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src); void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src); + void vfnmadd213sd(XMMRegister dst, XMMRegister nds, XMMRegister src); + void evfnmadd213sd(XMMRegister dst, XMMRegister nds, XMMRegister src, EvexRoundPrefix rmode); + void vfnmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2); void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src); void vmulsd(XMMRegister dst, XMMRegister nds, Address src); void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src); @@ -2334,8 +2347,11 @@ class Assembler : public AbstractAssembler { // Round Packed Double precision value. void vroundpd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len); void vroundpd(XMMRegister dst, Address src, int32_t rmode, int vector_len); + void vrndscalesd(XMMRegister dst, XMMRegister src1, XMMRegister src2, int32_t rmode); void vrndscalepd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len); void vrndscalepd(XMMRegister dst, Address src, int32_t rmode, int vector_len); + void vroundsd(XMMRegister dst, XMMRegister src, XMMRegister src2, int32_t rmode); + void vroundsd(XMMRegister dst, XMMRegister src, Address src2, int32_t rmode); // Bitwise Logical AND of Packed Floating-Point Values void andpd(XMMRegister dst, XMMRegister src); @@ -2719,6 +2735,8 @@ class Assembler : public AbstractAssembler { void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8); void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8); + void extractps(Register dst, XMMRegister src, uint8_t imm8); + // xmm/mem sourced byte/word/dword/qword replicate void vpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len); void vpbroadcastb(XMMRegister dst, Address src, int vector_len); @@ -2952,6 +2970,8 @@ class InstructionAttr { _embedded_opmask_register_specifier = mask->encoding() & 0x7; } + void set_extended_context(void) { _is_extended_context = true; } + }; #endif // CPU_X86_ASSEMBLER_X86_HPP diff --git a/src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp b/src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp index db332274a68b6..b6a27abf0f37e 100644 --- a/src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp +++ b/src/hotspot/cpu/x86/c1_LIRGenerator_x86.cpp @@ -968,7 +968,7 @@ void LIRGenerator::do_LibmIntrinsic(Intrinsic* x) { break; case vmIntrinsics::_dpow: if (StubRoutines::dpow() != nullptr) { - __ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args()); + __ call_runtime_leaf(StubRoutines::dpow(), getThreadTemp(), result_reg, cc->args()); } else { __ call_runtime_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), getThreadTemp(), result_reg, cc->args()); } diff --git a/src/hotspot/cpu/x86/sharedRuntime_x86.cpp b/src/hotspot/cpu/x86/sharedRuntime_x86.cpp index 1d13b710430e8..d39cab092f8f3 100644 --- a/src/hotspot/cpu/x86/sharedRuntime_x86.cpp +++ b/src/hotspot/cpu/x86/sharedRuntime_x86.cpp @@ -87,6 +87,8 @@ void SharedRuntime::inline_check_hashcode_from_object_header(MacroAssembler* mas #if defined(TARGET_COMPILER_gcc) && 
!defined(_WIN64) JRT_LEAF(jfloat, SharedRuntime::frem(jfloat x, jfloat y)) jfloat retval; + const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); + if (!is_LP64 || UseAVX < 1 || !UseFMA) { asm ("\ 1: \n\ fprem \n\ @@ -97,11 +99,21 @@ jne 1b \n\ :"=t"(retval) :"0"(x), "u"(y) :"cc", "ax"); + } else { + assert(StubRoutines::fmod() != nullptr, ""); + jdouble (*addr)(jdouble, jdouble) = (double (*)(double, double))StubRoutines::fmod(); + jdouble dx = (jdouble) x; + jdouble dy = (jdouble) y; + + retval = (jfloat) (*addr)(dx, dy); + } return retval; JRT_END JRT_LEAF(jdouble, SharedRuntime::drem(jdouble x, jdouble y)) jdouble retval; + const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); + if (!is_LP64 || UseAVX < 1 || !UseFMA) { asm ("\ 1: \n\ fprem \n\ @@ -112,6 +124,12 @@ jne 1b \n\ :"=t"(retval) :"0"(x), "u"(y) :"cc", "ax"); + } else { + assert(StubRoutines::fmod() != nullptr, ""); + jdouble (*addr)(jdouble, jdouble) = (double (*)(double, double))StubRoutines::fmod(); + + retval = (*addr)(x, y); + } return retval; JRT_END #endif // TARGET_COMPILER_gcc && !_WIN64 diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 6cd1765151492..2c100da4a5dbe 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -3937,6 +3937,10 @@ void StubGenerator::generate_initial_stubs() { } generate_libm_stubs(); + + if ((UseAVX >= 1) && (VM_Version::supports_avx512vlbwdq() || VM_Version::supports_fma())) { + StubRoutines::_fmod = generate_libmFmod(); // from stubGenerator_x86_64_fmod.cpp + } } void StubGenerator::generate_continuation_stubs() { diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index 86e3b169554d2..e7cc8e3b40a40 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -486,6 +486,7 @@ class StubGenerator: public StubCodeGenerator { address generate_libmPow(); address generate_libmLog(); address generate_libmLog10(); + address generate_libmFmod(); // Shared constants static address ZERO; diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_fmod.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_fmod.cpp new file mode 100644 index 0000000000000..04ad300ddcd87 --- /dev/null +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_fmod.cpp @@ -0,0 +1,524 @@ +/* + * Copyright (c) 2023, Intel Corporation. All rights reserved. + * Intel Math Library (LIBM) Source Code + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" +#include "macroAssembler_x86.hpp" +#include "stubGenerator_x86_64.hpp" + +/******************************************************************************/ +// ALGORITHM DESCRIPTION - FMOD() +// --------------------- +// +// If either value1 or value2 is NaN, the result is NaN. +// +// If neither value1 nor value2 is NaN, the sign of the result equals the sign of the dividend. +// +// If the dividend is an infinity or the divisor is a zero or both, the result is NaN. +// +// If the dividend is finite and the divisor is an infinity, the result equals the dividend. +// +// If the dividend is a zero and the divisor is finite, the result equals the dividend. +// +// In the remaining cases, where neither operand is an infinity, a zero, or NaN, the floating-point +// remainder result from a dividend value1 and a divisor value2 is defined by the mathematical +// relation result = value1 - (value2 * q), where q is an integer that is negative only if +// value1 / value2 is negative, and positive only if value1 / value2 is positive, and whose magnitude +// is as large as possible without exceeding the magnitude of the true mathematical quotient of value1 and value2. +// +/******************************************************************************/ + +#define __ _masm-> + +ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_NaN[] = { + 0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL // NaN vector +}; +ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_1p260[] = { + 0x5030000000000000ULL, // 0x1p+260 +}; + +ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_MAX[] = { + 0x7FEFFFFFFFFFFFFFULL, // Max +}; + +ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_INF[] = { + 0x7FF0000000000000ULL, // Inf +}; + +ATTRIBUTE_ALIGNED(32) static const uint64_t CONST_e307[] = { + 0x7FE0000000000000ULL +}; + +address StubGenerator::generate_libmFmod() { + StubCodeMark mark(this, "StubRoutines", "libmFmod"); + address start = __ pc(); + __ enter(); // required for proper stackwalking of RuntimeStub frame + + if (VM_Version::supports_avx512vlbwdq()) { // AVX512 version + + // Source used to generate the AVX512 fmod assembly below: + // + // #include + // #include + // #pragma float_control(precise, on) + // + // #define UINT32 unsigned int + // #define SINT32 int + // #define UINT64 unsigned __int64 + // #define SINT64 __int64 + // + // #define DP_FMA(a, b, c) __fence(_mm_cvtsd_f64(_mm_fmadd_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c)))) + // #define DP_FMA_RN(a, b, c) _mm_cvtsd_f64(_mm_fmadd_round_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) + // #define DP_FMA_RZ(a, b, c) __fence(_mm_cvtsd_f64(_mm_fmadd_round_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c), (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)))) + // + // #define DP_ROUND_RZ(a) _mm_cvtsd_f64(_mm_roundscale_sd(_mm_setzero_pd(), _mm_set_sd(a), (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC))) + // + // #define DP_CONST(C) _castu64_f64(0x##C##ull) + // #define DP_AND(X, Y) _mm_cvtsd_f64(_mm_and_pd(_mm_set_sd(X), _mm_set_sd(Y))) + // #define DP_XOR(X, Y) _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(X), _mm_set_sd(Y))) + // #define DP_OR(X, Y) _mm_cvtsd_f64(_mm_or_pd(_mm_set_sd(X), _mm_set_sd(Y))) + // #define DP_DIV_RZ(a, b) __fence(_mm_cvtsd_f64(_mm_div_round_sd(_mm_set_sd(a), _mm_set_sd(b), (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)))) + // #define DP_FNMA(a, b, c) __fence(_mm_cvtsd_f64(_mm_fnmadd_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c)))) + // #define DP_FNMA_RZ(a, b, c) 
__fence(_mm_cvtsd_f64(_mm_fnmadd_round_sd(_mm_set_sd(a), _mm_set_sd(b), _mm_set_sd(c), (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)))) + // + // #define D2L(x) _mm_castpd_si128(x) + // // transfer highest 32 bits (of low 64b) to GPR + // #define TRANSFER_HIGH_INT32(X) _mm_extract_epi32(D2L(_mm_set_sd(X)), 1) + // + // double fmod(double x, double y) + // { + // double a, b, sgn_a, q, bs, bs2; + // unsigned eq; + + Label L_5280, L_52a0, L_5256, L_5300, L_5320, L_52c0, L_52d0, L_5360, L_5380, L_53b0, L_5390; + Label L_53c0, L_52a6, L_53d0, L_exit; + + __ movdqa(xmm2, xmm0); + // // |x|, |y| + // a = DP_AND(x, DP_CONST(7fffffffffffffff)); + __ movq(xmm0, xmm0); + __ mov64(rax, 0x7FFFFFFFFFFFFFFFULL); + __ evpbroadcastq(xmm3, rax, Assembler::AVX_128bit); + __ vpand(xmm6, xmm0, xmm3, Assembler::AVX_128bit); + // b = DP_AND(y, DP_CONST(7fffffffffffffff)); + __ vpand(xmm4, xmm1, xmm3, Assembler::AVX_128bit); + // // sign(x) + // sgn_a = DP_XOR(x, a); + __ vpxor(xmm3, xmm6, xmm0, Assembler::AVX_128bit); + // q = DP_DIV_RZ(a, b); + __ movq(xmm5, xmm4); + __ evdivsd(xmm0, xmm6, xmm5, Assembler::EVEX_RZ); + // q = DP_ROUND_RZ(q); + __ movq(xmm0, xmm0); + // a = DP_AND(x, DP_CONST(7fffffffffffffff)); + __ vxorpd(xmm7, xmm7, xmm7, Assembler::AVX_128bit); + // q = DP_ROUND_RZ(q); + __ vroundsd(xmm0, xmm7, xmm0, 0xb); + // eq = TRANSFER_HIGH_INT32(q); + __ extractps(rax, xmm0, 1); + // if (!eq) return x + sgn_a; + __ testl(rax, rax); + __ jcc(Assembler::equal, L_5280); + // if (eq >= 0x7fefffffu) goto SPECIAL_FMOD; + __ cmpl(rax, 0x7feffffe); + __ jcc(Assembler::belowEqual, L_52a0); + __ vpxor(xmm2, xmm2, xmm2, Assembler::AVX_128bit); + // SPECIAL_FMOD: + // + // // y==0 or x==Inf? + // if ((b == 0.0) || (!(a <= DP_CONST(7fefffffffffffff)))) + __ ucomisd(xmm4, xmm2); + __ jcc(Assembler::notEqual, L_5256); + __ jcc(Assembler::noParity, L_5300); + __ bind(L_5256); + __ movsd(xmm2, ExternalAddress((address)CONST_MAX), rax); + __ ucomisd(xmm2, xmm6); + __ jcc(Assembler::below, L_5300); + __ movsd(xmm0, ExternalAddress((address)CONST_INF), rax); + // return DP_FNMA(b, q, a); // NaN + // // y is NaN? 
+ // if (!(b <= DP_CONST(7ff0000000000000))) return y + y; + __ ucomisd(xmm0, xmm4); + __ jcc(Assembler::aboveEqual, L_5320); + __ vaddsd(xmm0, xmm1, xmm1); + __ jmp(L_exit); + // if (!eq) return x + sgn_a; + __ align32(); + __ bind(L_5280); + __ vaddsd(xmm0, xmm3, xmm2); + __ jmp(L_exit); + // a = DP_FNMA_RZ(b, q, a); + __ align(8); + __ bind(L_52a0); + __ evfnmadd213sd(xmm0, xmm4, xmm6, Assembler::EVEX_RZ); + // while (b <= a) + __ bind(L_52a6); + __ ucomisd(xmm0, xmm4); + __ jcc(Assembler::aboveEqual, L_52c0); + // a = DP_XOR(a, sgn_a); + __ vpxor(xmm0, xmm3, xmm0, Assembler::AVX_128bit); + __ jmp(L_exit); + __ bind(L_52c0); + __ movq(xmm6, xmm0); + // q = DP_ROUND_RZ(q); + __ vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit); + __ align32(); + __ bind(L_52d0); + // q = DP_DIV_RZ(a, b); + __ evdivsd(xmm2, xmm6, xmm5, Assembler::EVEX_RZ); + // q = DP_ROUND_RZ(q); + __ movq(xmm2, xmm2); + __ vroundsd(xmm2, xmm1, xmm2, 0xb); + // a = DP_FNMA_RZ(b, q, a); + __ evfnmadd213sd(xmm2, xmm4, xmm0, Assembler::EVEX_RZ); + // while (b <= a) + __ ucomisd(xmm2, xmm4); + __ movq(xmm6, xmm2); + __ movapd(xmm0, xmm2); + __ jcc(Assembler::aboveEqual, L_52d0); + // a = DP_XOR(a, sgn_a); + __ vpxor(xmm0, xmm3, xmm2, Assembler::AVX_128bit); + __ jmp(L_exit); + // return DP_FNMA(b, q, a); // NaN + __ bind(L_5300); + __ vfnmadd213sd(xmm0, xmm4, xmm6); + __ jmp(L_exit); + // bs = b * DP_CONST(7fe0000000000000); + __ bind(L_5320); + __ vmulsd(xmm1, xmm4, ExternalAddress((address)CONST_e307), rax); + // q = DP_DIV_RZ(a, bs); + __ movq(xmm2, xmm1); + __ evdivsd(xmm0, xmm6, xmm2, Assembler::EVEX_RZ); + // q = DP_ROUND_RZ(q); + __ movq(xmm0, xmm0); + __ vroundsd(xmm7, xmm7, xmm0, 0xb); + // eq = TRANSFER_HIGH_INT32(q); + __ extractps(rax, xmm7, 1); + // if (eq >= 0x7fefffffu) + __ cmpl(rax, 0x7fefffff); + __ jcc(Assembler::below, L_5360); + // // b* 2*1023 * 2^1023 + // bs2 = bs * DP_CONST(7fe0000000000000); + __ vmulsd(xmm0, xmm1, ExternalAddress((address)CONST_e307), rax); + // while (bs2 <= a) + __ ucomisd(xmm6, xmm0); + __ jcc(Assembler::aboveEqual, L_5380); + __ movapd(xmm7, xmm6); + __ jmp(L_53b0); + // a = DP_FNMA_RZ(b, q, a); + __ bind(L_5360); + __ evfnmadd213sd(xmm7, xmm1, xmm6, Assembler::EVEX_RZ); + __ jmp(L_53b0); + // q = DP_ROUND_RZ(q); + __ bind(L_5380); + __ vxorpd(xmm8, xmm8, xmm8, Assembler::AVX_128bit); + // q = DP_DIV_RZ(qa, bs2); + __ align32(); + __ bind(L_5390); + __ evdivsd(xmm7, xmm6, xmm0, Assembler::EVEX_RZ); + // q = DP_ROUND_RZ(q); + __ movq(xmm7, xmm7); + __ vroundsd(xmm7, xmm8, xmm7, 0xb); + // a = DP_FNMA_RZ(bs2, q, a); + __ evfnmadd213sd(xmm7, xmm0, xmm6, Assembler::EVEX_RZ); + // while (bs2 <= a) + __ ucomisd(xmm7, xmm0); + __ movapd(xmm6, xmm7); + __ jcc(Assembler::aboveEqual, L_5390); + // while (bs <= a) + __ bind(L_53b0); + __ ucomisd(xmm7, xmm1); + __ jcc(Assembler::aboveEqual, L_53c0); + __ movapd(xmm0, xmm7); + __ jmp(L_52a6); + // q = DP_ROUND_RZ(q); + __ bind(L_53c0); + __ vxorpd(xmm6, xmm6, xmm6, Assembler::AVX_128bit); + // q = DP_DIV_RZ(a, bs); + __ align32(); + __ bind(L_53d0); + __ evdivsd(xmm0, xmm7, xmm2, Assembler::EVEX_RZ); + // q = DP_ROUND_RZ(q); + __ movq(xmm0, xmm0); + __ vroundsd(xmm0, xmm6, xmm0, 0xb); + // a = DP_FNMA_RZ(bs, q, a); + __ evfnmadd213sd(xmm0, xmm1, xmm7, Assembler::EVEX_RZ); + // while (bs <= a) + __ ucomisd(xmm0, xmm1); + __ movapd(xmm7, xmm0); + __ jcc(Assembler::aboveEqual, L_53d0); + __ jmp(L_52a6); + + __ bind(L_exit); + +//////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////// +// AVX2 code +//////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////// + } else if (VM_Version::supports_fma()) { // AVX2 version + + Label L_104a, L_11bd, L_10c1, L_1090, L_11b9, L_10e7, L_11af, L_111c, L_10f3, L_116e, L_112a; + Label L_1173, L_1157, L_117f, L_11a0; + + // double fmod(double x, double y) + // { + // double a, b, sgn_a, q, bs, bs2, corr, res; + // unsigned eq; + // unsigned mxcsr, mxcsr_rz; + + // __asm { stmxcsr DWORD PTR[mxcsr] } + // mxcsr_rz = 0x7f80 | mxcsr; + __ push(rax); + __ stmxcsr(Address(rsp, 0)); + __ movl(rax, Address(rsp, 0)); + __ movl(rcx, rax); + __ orl(rcx, 0x7f80); + __ movl(Address(rsp, 0x04), rcx); + + // // |x|, |y| + // a = DP_AND(x, DP_CONST(7fffffffffffffff)); + __ movq(xmm2, xmm0); + __ vmovdqu(xmm3, ExternalAddress((address)CONST_NaN), rcx); + __ vpand(xmm4, xmm2, xmm3, Assembler::AVX_128bit); + // b = DP_AND(y, DP_CONST(7fffffffffffffff)); + __ vpand(xmm3, xmm1, xmm3, Assembler::AVX_128bit); + // // sign(x) + // sgn_a = DP_XOR(x, a); + __ mov64(rcx, 0x8000000000000000ULL); + __ movq(xmm5, rcx); + __ vpand(xmm2, xmm2, xmm5, Assembler::AVX_128bit); + + // if (a < b) return x + sgn_a; + __ ucomisd(xmm3, xmm4); + __ jcc(Assembler::belowEqual, L_104a); + __ vaddsd(xmm0, xmm2, xmm0); + __ jmp(L_11bd); + + // if (((mxcsr & 0x6000)!=0x2000) && (a < b * 0x1p+260)) + __ bind(L_104a); + __ andl(rax, 0x6000); + __ cmpl(rax, 0x2000); + __ jcc(Assembler::equal, L_10c1); + __ vmulsd(xmm0, xmm3, ExternalAddress((address)CONST_1p260), rax); + __ ucomisd(xmm0, xmm4); + __ jcc(Assembler::belowEqual, L_10c1); + // { + // q = DP_DIV(a, b); + __ vdivpd(xmm0, xmm4, xmm3, Assembler::AVX_128bit); + // corr = DP_SHR(DP_FNMA(b, q, a), 63); + __ movapd(xmm1, xmm0); + __ vfnmadd213sd(xmm1, xmm3, xmm4); + __ movq(xmm5, xmm1); + __ vpxor(xmm1, xmm1, xmm1, Assembler::AVX_128bit); + __ vpcmpgtq(xmm5, xmm1, xmm5, Assembler::AVX_128bit); + // q = DP_PSUBQ(q, corr); + __ vpaddq(xmm0, xmm5, xmm0, Assembler::AVX_128bit); + // q = DP_TRUNC(q); + __ vroundsd(xmm0, xmm0, xmm0, 3); + // a = DP_FNMA(b, q, a); + __ vfnmadd213sd(xmm0, xmm3, xmm4); + __ align32(); + // while (b <= a) + __ bind(L_1090); + __ ucomisd(xmm0, xmm3); + __ jcc(Assembler::below, L_11b9); + // { + // q = DP_DIV(a, b); + __ vdivsd(xmm4, xmm0, xmm3); + // corr = DP_SHR(DP_FNMA(b, q, a), 63); + __ movapd(xmm5, xmm4); + __ vfnmadd213sd(xmm5, xmm3, xmm0); + __ movq(xmm5, xmm5); + __ vpcmpgtq(xmm5, xmm1, xmm5, Assembler::AVX_128bit); + // q = DP_PSUBQ(q, corr); + __ vpaddq(xmm4, xmm5, xmm4, Assembler::AVX_128bit); + // q = DP_TRUNC(q); + __ vroundsd(xmm4, xmm4, xmm4, 3); + // a = DP_FNMA(b, q, a); + __ vfnmadd231sd(xmm0, xmm3, xmm4); + __ jmp(L_1090); + // } + // return DP_XOR(a, sgn_a); + // } + + // __asm { ldmxcsr DWORD PTR [mxcsr_rz] } + __ bind(L_10c1); + __ ldmxcsr(Address(rsp, 0x04)); + + // q = DP_DIV(a, b); + __ vdivpd(xmm0, xmm4, xmm3, Assembler::AVX_128bit); + // q = DP_TRUNC(q); + __ vroundsd(xmm0, xmm0, xmm0, 3); + + // eq = TRANSFER_HIGH_INT32(q); + __ extractps(rax, xmm0, 1); + + // if (__builtin_expect((eq >= 0x7fefffffu), (0==1))) goto SPECIAL_FMOD; + __ cmpl(rax, 0x7feffffe); + __ jcc(Assembler::above, L_10e7); + + // a = DP_FNMA(b, q, a); + __ vfnmadd213sd(xmm0, xmm3, xmm4); + __ jmp(L_11af); + + // SPECIAL_FMOD: + + // // y==0 or x==Inf? 
+ // if ((b == 0.0) || (!(a <= DP_CONST(7fefffffffffffff)))) + __ bind(L_10e7); + __ vpxor(xmm5, xmm5, xmm5, Assembler::AVX_128bit); + __ ucomisd(xmm3, xmm5); + __ jcc(Assembler::notEqual, L_10f3); + __ jcc(Assembler::noParity, L_111c); + + __ bind(L_10f3); + __ movsd(xmm5, ExternalAddress((address)CONST_MAX), rax); + __ ucomisd(xmm5, xmm4); + __ jcc(Assembler::below, L_111c); + // return res; + // } + // // y is NaN? + // if (!(b <= DP_CONST(7ff0000000000000))) { + __ movsd(xmm0, ExternalAddress((address)CONST_INF), rax); + __ ucomisd(xmm0, xmm3); + __ jcc(Assembler::aboveEqual, L_112a); + // res = y + y; + __ vaddsd(xmm0, xmm1, xmm1); + // __asm { ldmxcsr DWORD PTR[mxcsr] } + __ ldmxcsr(Address(rsp, 0)); + __ jmp(L_11bd); + // { + // res = DP_FNMA(b, q, a); // NaN + __ bind(L_111c); + __ vfnmadd213sd(xmm0, xmm3, xmm4); + // __asm { ldmxcsr DWORD PTR[mxcsr] } + __ ldmxcsr(Address(rsp, 0)); + __ jmp(L_11bd); + // return res; + // } + + // // b* 2*1023 + // bs = b * DP_CONST(7fe0000000000000); + __ bind(L_112a); + __ vmulsd(xmm1, xmm3, ExternalAddress((address)CONST_e307), rax); + + // q = DP_DIV(a, bs); + __ vdivsd(xmm0, xmm4, xmm1); + // q = DP_TRUNC(q); + __ vroundsd(xmm0, xmm0, xmm0, 3); + + // eq = TRANSFER_HIGH_INT32(q); + __ extractps(rax, xmm0, 1); + + // if (eq >= 0x7fefffffu) + __ cmpl(rax, 0x7fefffff); + __ jcc(Assembler::below, L_116e); + // { + // // b* 2*1023 * 2^1023 + // bs2 = bs * DP_CONST(7fe0000000000000); + __ vmulsd(xmm0, xmm1, ExternalAddress((address)CONST_e307), rax); + // while (bs2 <= a) + __ ucomisd(xmm4, xmm0); + __ jcc(Assembler::below, L_1173); + // { + // q = DP_DIV(a, bs2); + __ bind(L_1157); + __ vdivsd(xmm5, xmm4, xmm0); + // q = DP_TRUNC(q); + __ vroundsd(xmm5, xmm5, xmm5, 3); + // a = DP_FNMA(bs2, q, a); + __ vfnmadd231sd(xmm4, xmm0, xmm5); + // while (bs2 <= a) + __ ucomisd(xmm4, xmm0); + __ jcc(Assembler::aboveEqual, L_1157); + __ jmp(L_1173); + // } + // } + // else + // a = DP_FNMA(bs, q, a); + __ bind(L_116e); + __ vfnmadd231sd(xmm4, xmm1, xmm0); + + // while (bs <= a) + __ bind(L_1173); + __ ucomisd(xmm4, xmm1); + __ jcc(Assembler::aboveEqual, L_117f); + __ movapd(xmm0, xmm4); + __ jmp(L_11af); + // { + // q = DP_DIV(a, bs); + __ bind(L_117f); + __ vdivsd(xmm0, xmm4, xmm1); + // q = DP_TRUNC(q); + __ vroundsd(xmm0, xmm0, xmm0, 3); + // a = DP_FNMA(bs, q, a); + __ vfnmadd213sd(xmm0, xmm1, xmm4); + + // while (bs <= a) + __ ucomisd(xmm0, xmm1); + __ movapd(xmm4, xmm0); + __ jcc(Assembler::aboveEqual, L_117f); + __ jmp(L_11af); + __ align32(); + // { + // q = DP_DIV(a, b); + __ bind(L_11a0); + __ vdivsd(xmm1, xmm0, xmm3); + // q = DP_TRUNC(q); + __ vroundsd(xmm1, xmm1, xmm1, 3); + // a = DP_FNMA(b, q, a); + __ vfnmadd231sd(xmm0, xmm3, xmm1); + + // FMOD_CONT: + // while (b <= a) + __ bind(L_11af); + __ ucomisd(xmm0, xmm3); + __ jcc(Assembler::aboveEqual, L_11a0); + // } + + // __asm { ldmxcsr DWORD PTR[mxcsr] } + __ ldmxcsr(Address(rsp, 0)); + __ bind(L_11b9); + __ vpxor(xmm0, xmm2, xmm0, Assembler::AVX_128bit); + // } + + // goto FMOD_CONT; + + // } + __ bind(L_11bd); + __ pop(rax); + + } else { // SSE version + assert(false, "SSE not implemented"); + } + + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + return start; +} + +#undef __ diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp index 7a6974088ba43..73bc3a003fbbf 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -161,6 +161,7 @@ address 
StubRoutines::_vectorizedMismatch = nullptr; address StubRoutines::_dexp = nullptr; address StubRoutines::_dlog = nullptr; address StubRoutines::_dlog10 = nullptr; +address StubRoutines::_fmod = nullptr; address StubRoutines::_dpow = nullptr; address StubRoutines::_dsin = nullptr; address StubRoutines::_dcos = nullptr; diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index 5ce9176f08a2f..ae37b9409e90b 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -249,6 +249,7 @@ class StubRoutines: AllStatic { static address _dlibm_reduce_pi04l; static address _dlibm_tan_cot_huge; static address _dtan; + static address _fmod; static address _f2hf; static address _hf2f; @@ -425,6 +426,7 @@ class StubRoutines: AllStatic { static address dlog() { return _dlog; } static address dlog10() { return _dlog10; } static address dpow() { return _dpow; } + static address fmod() { return _fmod; } static address dsin() { return _dsin; } static address dcos() { return _dcos; } static address dlibm_reduce_pi04l() { return _dlibm_reduce_pi04l; } diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index 37241534b2b7e..cd3ccb79e3133 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -556,6 +556,7 @@ static_field(StubRoutines, _dlog, address) \ static_field(StubRoutines, _dlog10, address) \ static_field(StubRoutines, _dpow, address) \ + static_field(StubRoutines, _fmod, address) \ static_field(StubRoutines, _dsin, address) \ static_field(StubRoutines, _dcos, address) \ static_field(StubRoutines, _dtan, address) \ diff --git a/test/hotspot/jtreg/compiler/floatingpoint/DmodTest.java b/test/hotspot/jtreg/compiler/floatingpoint/DmodTest.java new file mode 100644 index 0000000000000..4f386a42de439 --- /dev/null +++ b/test/hotspot/jtreg/compiler/floatingpoint/DmodTest.java @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2023, Intel Corporation. All rights reserved. + * Intel Math Library (LIBM) Source Code + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @test + * @bug 8308966 + * @summary Add intrinsic for float/double modulo for x86 AVX2 and AVX512 + * @run main compiler.floatingpoint.DmodTest + */ + + package compiler.floatingpoint; + + import java.lang.Double; + + public class DmodTest { + static double [] op1 = { 1.2345d, 0.0d, -0.0d, 1.0d/0.0d, -1.0d/0.0d, 0.0d/0.0d }; + static double [] op2 = { 1.2345d, 0.0d, -0.0d, 1.0d/0.0d, -1.0d/0.0d, 0.0d/0.0d }; + static double [][] res = { + { + 0.0d, + Double.NaN, + Double.NaN, + 1.2345d, + 1.2345d, + Double.NaN, + }, + { + 0.0d, + Double.NaN, + Double.NaN, + 0.0d, + 0.0d, + Double.NaN, + }, + { + -0.0d, + Double.NaN, + Double.NaN, + -0.0d, + -0.0d, + Double.NaN, + }, + { + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + }, + { + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + }, + { + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + Double.NaN, + }, + }; + public static void main(String[] args) throws Exception { + double f1, f2, f3; + boolean failure = false; + boolean print_failure = false; + for (int i = 0; i < 100_000; i++) { + for (int j = 0; j < op1.length; j++) { + for (int k = 0; k < op2.length; k++) { + f1 = op1[j]; + f2 = op2[k]; + f3 = f1 % f2; + + if (Double.isNaN(res[j][k])) { + if (!Double.isNaN(f3)) { + failure = true; + print_failure = true; + } + } else if (Double.isNaN(f3)) { + failure = true; + print_failure = true; + } else if (f3 != res[j][k]) { + failure = true; + print_failure = true; + } + + if (print_failure) { + System.out.println( "Actual " + f1 + " % " + f2 + " = " + f3); + System.out.println( "Expected " + f1 + " % " + f2 + " = " + res[j][k]); + print_failure = false; + } + } + } + } + + if (failure) { + throw new RuntimeException("Test Failed"); + } else { + System.out.println("Test passed."); + } + } +} + diff --git a/test/hotspot/jtreg/compiler/floatingpoint/FmodTest.java b/test/hotspot/jtreg/compiler/floatingpoint/FmodTest.java new file mode 100644 index 0000000000000..859fd34650a51 --- /dev/null +++ b/test/hotspot/jtreg/compiler/floatingpoint/FmodTest.java @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2023, Intel Corporation. All rights reserved. + * Intel Math Library (LIBM) Source Code + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +/** + * @test + * @bug 8308966 + * @summary Add intrinsic for float/double modulo for x86 AVX2 and AVX512 + * @run main compiler.floatingpoint.FmodTest + */ + + package compiler.floatingpoint; + + import java.lang.Float; + + public class FmodTest { + static float [] op1 = { 1.2345f, 0.0f, -0.0f, 1.0f/0.0f, -1.0f/0.0f, 0.0f/0.0f }; + static float [] op2 = { 1.2345f, 0.0f, -0.0f, 1.0f/0.0f, -1.0f/0.0f, 0.0f/0.0f }; + static float [][] res = { + { + 0.0f, + Float.NaN, + Float.NaN, + 1.2345f, + 1.2345f, + Float.NaN, + }, + { + 0.0f, + Float.NaN, + Float.NaN, + 0.0f, + 0.0f, + Float.NaN, + }, + { + -0.0f, + Float.NaN, + Float.NaN, + -0.0f, + -0.0f, + Float.NaN, + }, + { + Float.NaN, + Float.NaN, + Float.NaN, + Float.NaN, + Float.NaN, + Float.NaN, + }, + { + Float.NaN, + Float.NaN, + Float.NaN, + Float.NaN, + Float.NaN, + Float.NaN, + }, + { + Float.NaN, + Float.NaN, + Float.NaN, + Float.NaN, + Float.NaN, + Float.NaN, + }, + }; + public static void main(String[] args) throws Exception { + float f1, f2, f3; + boolean failure = false; + boolean print_failure = false; + for (int i = 0; i < 100_000; i++) { + for (int j = 0; j < op1.length; j++) { + for (int k = 0; k < op2.length; k++) { + f1 = op1[j]; + f2 = op2[k]; + f3 = f1 % f2; + + if (Float.isNaN(res[j][k])) { + if (!Float.isNaN(f3)) { + failure = true; + print_failure = true; + } + } else if (Float.isNaN(f3)) { + failure = true; + print_failure = true; + } else if (f3 != res[j][k]) { + failure = true; + print_failure = true; + } + + if (print_failure) { + System.out.println( "Actual " + f1 + " % " + f2 + " = " + f3); + System.out.println( "Expected " + f1 + " % " + f2 + " = " + res[j][k]); + print_failure = false; + } + } + } + } + + if (failure) { + throw new RuntimeException("Test Failed"); + } else { + System.out.println("Test passed."); + } + } + }
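The ALGORITHM DESCRIPTION block in stubGenerator_x86_64_fmod.cpp spells out the special-case behaviour the stub must preserve; it is the same contract Java's % applies to doubles. A minimal behavioural model of those cases, with the general case delegated to the C library, assuming only standard <cmath> (this is a sketch of the contract, not the stub's code):

    #include <cmath>
    #include <limits>

    // Behavioural model of the special cases listed in the algorithm description;
    // the finite/finite case is handed to std::fmod, which has the same contract.
    static double fmod_reference(double x, double y) {
        if (std::isnan(x) || std::isnan(y)) return std::numeric_limits<double>::quiet_NaN();
        if (std::isinf(x) || y == 0.0)      return std::numeric_limits<double>::quiet_NaN(); // Inf % y, x % 0
        if (std::isinf(y) || x == 0.0)      return x;   // result equals the dividend, sign included
        return std::fmod(x, y);                         // |result| < |y|, sign of the dividend
    }

The tables in DmodTest.java and FmodTest.java enumerate exactly these cases (0.0, -0.0, +/-Inf, NaN) against each other, so the model above also describes what the tests expect.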
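Both the AVX512 and AVX2 paths reduce the operands with the same loop shape: q = trunc(a / b), then a = a - b * q via a fused negative-multiply-add, repeated while b <= a. A rough scalar sketch of that loop, assuming ordinary round-to-nearest arithmetic (the stub instead forces round-toward-zero through the EVEX {rz} encodings and MXCSR, and pre-scales b by 2^1023 when a / b would overflow; neither is modelled here, and special operands are assumed to have been filtered out already):

    #include <cmath>

    // One reduction step: mirrors the AVX2 path's q = DP_TRUNC(DP_DIV(a, b));
    // corr; a = DP_FNMA(b, q, a). The correction handles the division rounding
    // up past an integer, which would make the fused a - b*q go negative.
    static double reduce_step(double a, double b) {
        double q = std::trunc(a / b);
        double r = std::fma(-b, q, a);   // a - b*q with a single rounding
        if (r < 0.0) {                   // division rounded up; back q off by one
            q -= 1.0;
            r = std::fma(-b, q, a);
        }
        return r;
    }

    static double fmod_loop(double x, double y) {    // finite, non-zero inputs only
        double a = std::fabs(x), b = std::fabs(y);
        while (a >= b) {
            a = reduce_step(a, b);
        }
        return std::copysign(a, x);      // same effect as the stub's final XOR with sgn_a
    }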
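On the SharedRuntime side, the patched frem widens both float operands to double, calls StubRoutines::fmod(), and narrows the result back to float. That round trip is lossless: every float converts exactly to double, and the true remainder of two floats is itself representable as a float, so the double computation returns it exactly and the final narrowing does not round. A small self-check of that assumption, using hypothetical test values and only standard <cmath>:

    #include <cmath>
    #include <cassert>

    int main() {
        const float xs[] = {1.2345f, -7.5f, 1e30f, 3.0f};
        const float ys[] = {0.25f, 2.0f, 7.0f, 1e-30f};
        for (float x : xs) {
            for (float y : ys) {
                // float remainder computed directly vs. routed through double,
                // mirroring the frem -> StubRoutines::fmod() -> (jfloat) path
                assert(std::fmod(x, y) == (float) std::fmod((double) x, (double) y));
            }
        }
        return 0;
    }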