From a46a2c2b7d8daf68c793b6b96f1c5bc009aea62b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 25 Jun 2024 12:25:56 +0100
Subject: [PATCH] [X86] Lower vXi8 multiplies using PMADDUBSW on SSSE3+ targets (#95690)

Extends https://github.com/llvm/llvm-project/pull/95403 to handle
non-constant cases - we can avoid unpacks/extensions from vXi8 to vXi16
by using PMADDUBSW instead and truncating the vXi16 results back
together.

Most targets benefit from performing this for non-constant cases - it's
just Intel Core/SandyBridge era CPUs that might experience additional
Port0/15 contention (but lower instruction count).

Fixes https://github.com/llvm/llvm-project/issues/90748
---
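Note (illustration only, not part of the applied diff; `git am` ignores
text between the '---' separator and the first 'diff --git'): a minimal
sketch of the PMADDUBSW trick, written with C SSE intrinsics and a
hypothetical helper name of my own. PMADDUBSW multiplies unsigned bytes
of its first operand by signed bytes of its second and sums adjacent
pairs into i16 lanes; zeroing the odd (or even) bytes of one operand
leaves a single byte product per i16 lane, and since only the low 8
bits of each product are kept, the signedness and the saturating add
never matter:

    #include <tmmintrin.h> // SSSE3

    static __m128i mul_v16i8_pmaddubsw(__m128i a, __m128i b) {
      const __m128i lo_mask = _mm_set1_epi16(0x00FF);
      // Even-byte products: odd bytes of b zeroed, so each i16 lane of
      // the pmaddubsw result is a[2i]*b[2i] + a[2i+1]*0.
      __m128i b_even = _mm_and_si128(b, lo_mask);
      // Odd-byte products: even bytes of b zeroed instead.
      __m128i b_odd  = _mm_andnot_si128(lo_mask, b);
      __m128i r_even = _mm_maddubs_epi16(a, b_even);
      __m128i r_odd  = _mm_maddubs_epi16(a, b_odd);
      // Keep the low byte of each product and interleave the halves,
      // matching the 2*pmaddubsw/3*and/psllw/or cost-table comments.
      r_even = _mm_and_si128(r_even, lo_mask);
      r_odd  = _mm_slli_epi16(r_odd, 8);
      return _mm_or_si128(r_even, r_odd);
    }

This is what replaces the old unpack-lo/hi + 2*pmullw + pack sequence,
which is why the vXi8 ISD::MUL entries drop across the SSSE3+ cost
tables below (each CostKindTblEntry lists throughput, latency,
code-size and size+latency costs).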
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  22 +-
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  20 +-
 .../CostModel/X86/arith-int-codesize.ll       |  24 +-
 .../CostModel/X86/arith-int-latency.ll        |  16 +-
 .../CostModel/X86/arith-int-sizelatency.ll    |  16 +-
 llvm/test/Analysis/CostModel/X86/arith-int.ll |  40 +-
 .../Analysis/CostModel/X86/rem-codesize.ll    |  72 ++-
 .../Analysis/CostModel/X86/rem-latency.ll     |  28 +-
 .../Analysis/CostModel/X86/rem-sizelatency.ll |  28 +-
 llvm/test/Analysis/CostModel/X86/rem.ll       | 142 ++++-
 .../Analysis/CostModel/X86/slm-arith-costs.ll |   2 +-
 llvm/test/CodeGen/X86/avx2-arith.ll           |  18 +-
 llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 391 ++++++------
 llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 595 ++++++++----------
 .../CodeGen/X86/min-legal-vector-width.ll     | 133 ++--
 llvm/test/CodeGen/X86/pmul.ll                 | 266 ++++----
 .../CodeGen/X86/prefer-avx256-wide-mul.ll     |  17 +-
 .../X86/vector-shuffle-combining-sse41.ll     |  70 +--
 18 files changed, 953 insertions(+), 947 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 82d2b301d854e..2a55ab3fafdd7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28503,17 +28503,19 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
   MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

-  // For vXi8 mul-by-constant, try PMADDUBSW to avoid the need for extension.
+  // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
   // Don't do this if we only need to unpack one half.
-  if (Subtarget.hasSSSE3() &&
-      ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
-    bool IsLoLaneAllZeroOrUndef = true;
-    bool IsHiLaneAllZeroOrUndef = true;
-    for (auto [Idx, Val] : enumerate(B->ops())) {
-      if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
-        IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
-      else
-        IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+  if (Subtarget.hasSSSE3()) {
+    bool BIsBuildVector = isa<BuildVectorSDNode>(B);
+    bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
+    bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
+    if (BIsBuildVector) {
+      for (auto [Idx, Val] : enumerate(B->ops())) {
+        if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
+          IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+        else
+          IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+      }
     }
     if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
       SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index de0144331dba3..563890ede9b24 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -852,8 +852,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::SUB, MVT::v32i16, {  1,  1, 1, 1 } }, // psubw

     { ISD::MUL, MVT::v16i8,  {  4, 12, 4, 5 } }, // extend/pmullw/trunc
-    { ISD::MUL, MVT::v32i8,  {  6, 11,10,11 } }, // extend/pmullw/trunc
-    { ISD::MUL, MVT::v64i8,  {  6, 12,10,11 } }, // unpack/pmullw
+    { ISD::MUL, MVT::v32i8,  {  3, 10, 7,10 } }, // pmaddubsw
+    { ISD::MUL, MVT::v64i8,  {  3, 11, 7,10 } }, // pmaddubsw
     { ISD::MUL, MVT::v32i16, {  1,  5, 1, 1 } }, // pmullw

     { ISD::SUB, MVT::v32i8,  {  1,  1, 1, 1 } }, // psubb
@@ -1119,7 +1119,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::ADD, MVT::v4i64,  {  1,  1, 1, 2 } }, // paddq

     { ISD::MUL, MVT::v16i8,  {  5, 18, 6,12 } }, // extend/pmullw/pack
-    { ISD::MUL, MVT::v32i8,  {  6, 11,10,20 } }, // unpack/pmullw
+    { ISD::MUL, MVT::v32i8,  {  4,  8, 8,16 } }, // pmaddubsw
     { ISD::MUL, MVT::v16i16, {  2,  5, 1, 2 } }, // pmullw
     { ISD::MUL, MVT::v8i32,  {  4, 10, 1, 2 } }, // pmulld
     { ISD::MUL, MVT::v4i32,  {  2, 10, 1, 2 } }, // pmulld
@@ -1170,8 +1170,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     // We don't have to scalarize unsupported ops. We can issue two half-sized
     // operations and we only need to extract the upper YMM half.
     // Two ops + 1 extract + 1 insert = 4.
-    { ISD::MUL,     MVT::v32i8,  { 12, 12, 22, 23 } }, // unpack/pmullw + split
-    { ISD::MUL,     MVT::v16i8,  {  5,  6, 10, 12 } }, // unpack/pmullw
+    { ISD::MUL,     MVT::v32i8,  { 10, 11, 18, 19 } }, // pmaddubsw + split
+    { ISD::MUL,     MVT::v16i8,  {  5,  6,  8, 12 } }, // 2*pmaddubsw/3*and/psllw/or
     { ISD::MUL,     MVT::v16i16, {  4,  8,  5,  6 } }, // pmullw + split
     { ISD::MUL,     MVT::v8i32,  {  5,  8,  5, 10 } }, // pmulld + split
     { ISD::MUL,     MVT::v4i32,  {  2,  5,  1,  3 } }, // pmulld
@@ -1311,7 +1311,6 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v4i32,  { 16, 17,15,19 } }, // Shift each lane + blend.
     { ISD::SRA,  MVT::v2i64,  {  8, 17, 5, 7 } }, // splat+shuffle sequence.
-    { ISD::MUL,  MVT::v16i8,  {  6, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
     { ISD::MUL,  MVT::v4i32,  {  2, 11, 1, 1 } }  // pmulld (Nehalem from agner.org)
   };

@@ -1320,6 +1319,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
       if (auto KindCost = Entry->Cost[CostKind])
         return LT.first * *KindCost;

+  static const CostKindTblEntry SSSE3CostTable[] = {
+    { ISD::MUL, MVT::v16i8, {  5, 18,10,12 } }, // 2*pmaddubsw/3*and/psllw/or
+  };
+
+  if (ST->hasSSSE3())
+    if (const auto *Entry = CostTableLookup(SSSE3CostTable, ISD, LT.second))
+      if (auto KindCost = Entry->Cost[CostKind])
+        return LT.first * *KindCost;
+
   static const CostKindTblEntry SSE2CostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
     // custom.
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
index 050beb7fc25a3..f5ad65817950e 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-codesize.ll
@@ -791,9 +791,9 @@ define i32 @mul(i32 %arg) {
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I8 = mul <2 x i8> undef, undef
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'mul'
@@ -835,9 +835,9 @@ define i32 @mul(i32 %arg) {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I8 = mul <2 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'mul'
@@ -858,8 +858,8 @@ define i32 @mul(i32 %arg) {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'mul'
@@ -880,7 +880,7 @@ define i32 @mul(i32 %arg) {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
@@ -902,8 +902,8 @@ define i32 @mul(i32 %arg) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'mul'
@@ -924,7 +924,7 @@ define i32 @mul(i32 %arg) {
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
index 6cf278e98bd85..ed58f0f554e23 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-latency.ll
@@ -680,8 +680,8 @@ define i32 @mul(i32 %arg) {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'mul'
@@ -702,8 +702,8 @@ define i32 @mul(i32 %arg) {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'mul'
@@ -724,7 +724,7 @@ define i32 @mul(i32 %arg) {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
@@ -746,8 +746,8 @@ define i32 @mul(i32 %arg) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'mul'
@@ -768,7 +768,7 @@ define i32 @mul(i32 %arg) {
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = mul <32 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
index b5ca132d8c51d..c9ee064822636 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int-sizelatency.ll
@@ -680,8 +680,8 @@ define i32 @mul(i32 %arg) {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'mul'
@@ -702,8 +702,8 @@ define i32 @mul(i32 %arg) {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'mul'
@@ -724,7 +724,7 @@ define i32 @mul(i32 %arg) {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = mul <32 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
@@ -746,8 +746,8 @@ define i32 @mul(i32 %arg) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'mul'
@@ -768,7 +768,7 @@ define i32 @mul(i32 %arg) {
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I8 = mul <32 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/arith-int.ll b/llvm/test/Analysis/CostModel/X86/arith-int.ll
index e0f7afd3eb42d..28e36ad88a88c 100644
--- a/llvm/test/Analysis/CostModel/X86/arith-int.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-int.ll
@@ -905,9 +905,9 @@ define i32 @mul(i32 %arg) {
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = mul <2 x i8> undef, undef
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'mul'
@@ -927,9 +927,9 @@ define i32 @mul(i32 %arg) {
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = mul <2 x i8> undef, undef
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'mul'
@@ -950,8 +950,8 @@ define i32 @mul(i32 %arg) {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'mul'
@@ -972,8 +972,8 @@ define i32 @mul(i32 %arg) {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'mul'
@@ -994,7 +994,7 @@ define i32 @mul(i32 %arg) {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = mul <32 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
@@ -1016,8 +1016,8 @@ define i32 @mul(i32 %arg) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'mul'
@@ -1038,7 +1038,7 @@ define i32 @mul(i32 %arg) {
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I8 = mul <8 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = mul <32 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
@@ -1059,9 +1059,9 @@ define i32 @mul(i32 %arg) {
 ; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I8 = mul <2 x i8> undef, undef
 ; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'mul'
@@ -1081,9 +1081,9 @@ define i32 @mul(i32 %arg) {
 ; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = mul <2 x i8> undef, undef
 ; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = mul <4 x i8> undef, undef
 ; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = mul <8 x i8> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = mul <16 x i8> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = mul <32 x i8> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64I8 = mul <64 x i8> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = mul <16 x i8> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = mul <32 x i8> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = mul <64 x i8> undef, undef
 ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 %I64 = mul i64 undef, undef
diff --git a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
index 86e67eda2b1f2..b47580ba0abfd 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-codesize.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
@@ -289,6 +289,25 @@ define i32 @srem_constpow2() {
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 590 for instruction: %V64i8 = srem <64 x i8> undef,
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSSE3-LABEL: 'srem_constpow2'
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V2i64 = srem <2 x i64> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %V4i64 = srem <4 x i64> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V8i64 = srem <8 x i64> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V4i32 = srem <4 x i32> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V8i32 = srem <8 x i32> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 210 for instruction: %V16i32 = srem <16 x i32> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = srem i16 undef, 16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i16 = srem <8 x i16> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V16i16 = srem <16 x i16> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %V32i16 = srem <32 x i16> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V16i8 = srem <16 x i8> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 292 for instruction: %V32i8 = srem <32 x i8> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 582 for instruction: %V64i8 = srem <64 x i8> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'srem_constpow2'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2i64 = srem <2 x i64> undef,
@@ -322,9 +341,9 @@ define i32 @srem_constpow2() {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V16i16 = srem <16 x i16> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 328 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16i8 = srem <16 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'srem_constpow2'
@@ -342,8 +361,8 @@ define i32 @srem_constpow2() {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'srem_constpow2'
@@ -361,7 +380,7 @@ define i32 @srem_constpow2() {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V32i8 = srem <32 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
@@ -380,8 +399,8 @@ define i32 @srem_constpow2() {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'srem_constpow2'
@@ -527,6 +546,25 @@ define i32 @srem_uniformconstpow2() {
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V64i8 = srem <64 x i8> undef,
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
+; SSSE3-LABEL: 'srem_uniformconstpow2'
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2i64 = srem <2 x i64> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4i64 = srem <4 x i64> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V8i64 = srem <8 x i64> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i32 = srem <4 x i32> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V16i32 = srem <16 x i32> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = srem i16 undef, 16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = srem <8 x i16> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i16 = srem <16 x i16> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32i16 = srem <32 x i16> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i8 = srem <16 x i8> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32i8 = srem <32 x i8> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V64i8 = srem <64 x i8> undef,
+; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
+;
 ; SSE42-LABEL: 'srem_uniformconstpow2'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = srem i64 undef, 16
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2i64 = srem <2 x i64> undef,
@@ -560,9 +598,9 @@ define i32 @srem_uniformconstpow2() {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16i16 = srem <16 x i16> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
-; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16i8 = srem <16 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'srem_uniformconstpow2'
@@ -580,8 +618,8 @@ define i32 @srem_uniformconstpow2() {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'srem_uniformconstpow2'
@@ -599,7 +637,7 @@ define i32 @srem_uniformconstpow2() {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i8 = srem <32 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
@@ -618,8 +656,8 @@ define i32 @srem_uniformconstpow2() {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'srem_uniformconstpow2'
diff --git a/llvm/test/Analysis/CostModel/X86/rem-latency.ll b/llvm/test/Analysis/CostModel/X86/rem-latency.ll
index 1428c6cdaca63..5264a75659d15 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-latency.ll
@@ -323,8 +323,8 @@ define i32 @srem_constpow2() {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'srem_constpow2'
@@ -342,8 +342,8 @@ define i32 @srem_constpow2() {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 166 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'srem_constpow2'
@@ -361,7 +361,7 @@ define i32 @srem_constpow2() {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V32i8 = srem <32 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
@@ -380,8 +380,8 @@ define i32 @srem_constpow2() {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'srem_constpow2'
@@ -561,8 +561,8 @@ define i32 @srem_uniformconstpow2() {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'srem_uniformconstpow2'
@@ -580,8 +580,8 @@ define i32 @srem_uniformconstpow2() {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'srem_uniformconstpow2'
@@ -599,7 +599,7 @@ define i32 @srem_uniformconstpow2() {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32i8 = srem <32 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
@@ -618,8 +618,8 @@ define i32 @srem_uniformconstpow2() {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I8 = srem i8 undef, 16
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'srem_uniformconstpow2'
diff --git a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
index 2dc4c823cb539..817a0c7e070ac 100644
--- a/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/rem-sizelatency.ll
@@ -323,8 +323,8 @@ define i32 @srem_constpow2() {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 324 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 230 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 460 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 226 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 452 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'srem_constpow2'
@@ -342,8 +342,8 @@ define i32 @srem_constpow2() {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 268 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'srem_constpow2'
@@ -361,7 +361,7 @@ define i32 @srem_constpow2() {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 134 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V32i8 = srem <32 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
@@ -380,8 +380,8 @@ define i32 @srem_constpow2() {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'srem_constpow2'
@@ -561,8 +561,8 @@ define i32 @srem_uniformconstpow2() {
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX1-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX1-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'srem_uniformconstpow2'
@@ -580,8 +580,8 @@ define i32 @srem_uniformconstpow2() {
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'srem_uniformconstpow2'
@@ -599,7 +599,7 @@ define i32 @srem_uniformconstpow2() {
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32i8 = srem <32 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
@@ -618,8 +618,8 @@ define i32 @srem_uniformconstpow2() {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = srem i8 undef, 16
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i8 = srem <16 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32i8 = srem <32 x i8> undef,
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = srem <64 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32i8 = srem <32 x i8> undef,
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64i8 = srem <64 x i8> undef,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'srem_uniformconstpow2'
diff --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll
index 7ca17b2af033a..0e799d63741aa 100644
--- a/llvm/test/Analysis/CostModel/X86/rem.ll
b/llvm/test/Analysis/CostModel/X86/rem.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print" 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print" 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 @@ -193,6 +193,25 @@ define i32 @srem_const() { ; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SSSE3-LABEL: 'srem_const' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = srem i16 undef, 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = srem i8 undef, 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; SSE42-LABEL: 'srem_const' ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, 7 ; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, @@ -726,6 +745,25 @@ define i32 @srem_constpow2() { ; SSE2-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V64i8 = srem <64 x i8> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SSSE3-LABEL: 'srem_constpow2' +; SSSE3-NEXT: Cost Model: Found 
an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V2i64 = srem <2 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V4i64 = srem <4 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V8i64 = srem <8 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i32 = srem <16 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8i16 = srem <8 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V16i16 = srem <16 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = srem <32 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V16i8 = srem <16 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V32i8 = srem <32 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 300 for instruction: %V64i8 = srem <64 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; SSE42-LABEL: 'srem_constpow2' ; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = srem <2 x i64> undef, @@ -740,9 +778,9 @@ define i32 @srem_constpow2() { ; SSE42-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i16 = srem <16 x i16> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i16 = srem <32 x i16> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 -; SSE42-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16i8 = srem <16 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %V32i8 = srem <32 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V64i8 = srem <64 x i8> undef, +; SSE42-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = srem <16 x i8> undef, +; SSE42-NEXT: Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = srem <32 x i8> undef, +; SSE42-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = srem <64 x i8> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'srem_constpow2' @@ -760,8 +798,8 @@ define i32 @srem_constpow2() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32i16 = srem <32 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 ; AVX1-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16i8 = srem <16 x i8> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V32i8 = srem <32 x i8> undef, -; AVX1-NEXT: Cost 
Model: Found an estimated cost of 262 for instruction: %V64i8 = srem <64 x i8> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V32i8 = srem <32 x i8> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'srem_constpow2' @@ -779,8 +817,8 @@ define i32 @srem_constpow2() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V32i16 = srem <32 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 ; AVX2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16i8 = srem <16 x i8> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32i8 = srem <32 x i8> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V64i8 = srem <64 x i8> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32i8 = srem <32 x i8> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'srem_constpow2' @@ -798,7 +836,7 @@ define i32 @srem_constpow2() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32i16 = srem <32 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16i8 = srem <16 x i8> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V32i8 = srem <32 x i8> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32i8 = srem <32 x i8> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; @@ -817,8 +855,8 @@ define i32 @srem_constpow2() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i8 = srem <16 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i8 = srem <32 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64i8 = srem <64 x i8> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32i8 = srem <32 x i8> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'srem_constpow2' @@ -835,9 +873,9 @@ define i32 @srem_constpow2() { ; SLM-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V16i16 = srem <16 x i16> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 280 for instruction: %V32i16 = srem <32 x i16> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 -; SLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16i8 = srem <16 x i8> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %V32i8 = srem <32 x i8> 
undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V64i8 = srem <64 x i8> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = srem <16 x i8> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = srem <32 x i8> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = srem <64 x i8> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'srem_constpow2' @@ -854,9 +892,9 @@ define i32 @srem_constpow2() { ; GLM-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i16 = srem <16 x i16> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i16 = srem <32 x i16> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 -; GLM-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16i8 = srem <16 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %V32i8 = srem <32 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V64i8 = srem <64 x i8> undef, +; GLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16i8 = srem <16 x i8> undef, +; GLM-NEXT: Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = srem <32 x i8> undef, +; GLM-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = srem <64 x i8> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = srem i64 undef, 16 @@ -983,6 +1021,25 @@ define i32 @srem_uniformconstpow2() { ; SSE2-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = srem <64 x i8> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SSSE3-LABEL: 'srem_uniformconstpow2' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2i64 = srem <2 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4i64 = srem <4 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8i64 = srem <8 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = srem i32 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4i32 = srem <4 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8i32 = srem <8 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16i32 = srem <16 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = srem <8 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = srem <16 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = srem <16 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = srem <32 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = 
srem <64 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; SSE42-LABEL: 'srem_uniformconstpow2' ; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 ; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2i64 = srem <2 x i64> undef, @@ -997,9 +1054,9 @@ define i32 @srem_uniformconstpow2() { ; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = srem <16 x i16> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i8 = srem <16 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = srem <32 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = srem <64 x i8> undef, +; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = srem <16 x i8> undef, +; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = srem <32 x i8> undef, +; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = srem <64 x i8> undef, ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'srem_uniformconstpow2' @@ -1017,8 +1074,8 @@ define i32 @srem_uniformconstpow2() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V32i16 = srem <32 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 ; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i8 = srem <16 x i8> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32i8 = srem <32 x i8> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V64i8 = srem <64 x i8> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32i8 = srem <32 x i8> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'srem_uniformconstpow2' @@ -1036,8 +1093,8 @@ define i32 @srem_uniformconstpow2() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32i16 = srem <32 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 ; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i8 = srem <16 x i8> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = srem <64 x i8> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = srem <32 x i8> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'srem_uniformconstpow2' @@ -1055,7 +1112,7 @@ define i32 @srem_uniformconstpow2() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32i16 = srem <32 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 ; 
AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i8 = srem <16 x i8> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = srem <32 x i8> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; @@ -1074,8 +1131,8 @@ define i32 @srem_uniformconstpow2() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = srem <32 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16i8 = srem <16 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32i8 = srem <32 x i8> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64i8 = srem <64 x i8> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32i8 = srem <32 x i8> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'srem_uniformconstpow2' @@ -1092,9 +1149,9 @@ define i32 @srem_uniformconstpow2() { ; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = srem <16 x i16> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = srem <32 x i16> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 -; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i8 = srem <16 x i8> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = srem <32 x i8> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = srem <64 x i8> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = srem <16 x i8> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = srem <32 x i8> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = srem <64 x i8> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'srem_uniformconstpow2' @@ -1111,9 +1168,9 @@ define i32 @srem_uniformconstpow2() { ; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = srem <16 x i16> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef, ; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I8 = srem i8 undef, 16 -; GLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i8 = srem <16 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = srem <32 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = srem <64 x i8> undef, +; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = srem <16 x i8> undef, +; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = srem <32 x i8> undef, +; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = srem <64 x i8> undef, ; GLM-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = srem i64 undef, 16 @@ -1240,6 +1297,25 @@ define i32 @srem_constnegpow2() { ; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SSSE3-LABEL: 'srem_constnegpow2' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, -16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V4i64 = srem <4 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 640 for instruction: %V8i64 = srem <8 x i64> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = srem i16 undef, -16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = srem i8 undef, -16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; SSE42-LABEL: 'srem_constnegpow2' ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = srem i64 undef, -16 ; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V2i64 = srem <2 x i64> undef, diff --git a/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll index 251af66ef7be0..204290bd4e40f 100644 --- a/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll +++ b/llvm/test/Analysis/CostModel/X86/slm-arith-costs.ll @@ -192,7 +192,7 @@ entry: define <16 x i8> @slm-costs_8_v16_mul(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: 'slm-costs_8_v16_mul' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = mul nsw <16 x i8> %a, %b +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = mul nsw <16 x i8> %a, %b ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %res ; entry: diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index d8452d105ae55..90733dfb8465e 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -121,16 +121,14 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone { define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone { ; CHECK-LABEL: mul_v32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vpunpckhbw {{.*#+}} 
ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2 -; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpand %ymm3, %ymm0, %ymm0 -; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm3 +; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 +; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3 +; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpsllw $8, %ymm0, %ymm0 +; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %x = mul <32 x i8> %i, %j ret <32 x i8> %x diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index 3b5ff12fb4eca..acaa9be3f89a2 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2368,17 +2368,17 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; SSE41-NEXT: psubb %xmm3, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm1, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pmullw %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: packuswb %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm3, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 +; SSE41-NEXT: pand %xmm3, %xmm5 +; SSE41-NEXT: pandn %xmm2, %xmm3 +; SSE41-NEXT: pmaddubsw %xmm3, %xmm1 +; SSE41-NEXT: psllw $8, %xmm1 +; SSE41-NEXT: por %xmm1, %xmm5 +; SSE41-NEXT: paddb %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_signed_reg_reg: @@ -2390,16 +2390,14 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2431,13 +2429,12 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -2450,13 +2447,12 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm 
{{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -2584,28 +2580,28 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; ; SSE41-LABEL: vec128_i8_unsigned_reg_reg: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pminub %xmm1, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pminub %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pcmpeqb %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 +; SSE41-NEXT: pxor %xmm3, %xmm4 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 ; SSE41-NEXT: pmaxub %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm3, %xmm1 +; SSE41-NEXT: psubb %xmm2, %xmm1 ; SSE41-NEXT: psrlw $1, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm1, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pmullw %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: packuswb %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm3, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm3, %xmm5 +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: pandn %xmm4, %xmm2 +; SSE41-NEXT: pmaddubsw %xmm2, %xmm1 +; SSE41-NEXT: psllw $8, %xmm1 +; SSE41-NEXT: por %xmm1, %xmm5 +; SSE41-NEXT: paddb %xmm5, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_unsigned_reg_reg: @@ -2619,16 +2615,14 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; 
AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -2662,13 +2656,12 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -2681,13 +2674,12 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -2820,28 +2812,27 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; ; SSE41-LABEL: vec128_i8_signed_mem_reg: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpgtb %xmm0, %xmm3 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: pminsb %xmm0, %xmm1 -; SSE41-NEXT: pmaxsb %xmm2, %xmm0 -; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: 
movdqa (%rdi), %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pminsb %xmm0, %xmm3 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: psubb %xmm3, %xmm0 ; SSE41-NEXT: psrlw $1, %xmm0 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm0, %xmm3 -; SSE41-NEXT: pmullw %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 +; SSE41-NEXT: pand %xmm3, %xmm5 +; SSE41-NEXT: pandn %xmm2, %xmm3 +; SSE41-NEXT: pmaddubsw %xmm3, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_signed_mem_reg: @@ -2854,16 +2845,14 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -2897,13 +2886,12 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, 
%xmm3, %xmm3 -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -2917,13 +2905,12 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOPAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -3056,27 +3043,27 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; ; SSE41-LABEL: vec128_i8_signed_reg_mem: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pminsb %xmm2, %xmm3 -; SSE41-NEXT: pmaxsb %xmm0, %xmm2 -; SSE41-NEXT: psubb %xmm3, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm2, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pmullw %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: packuswb %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm3, %xmm0 +; SSE41-NEXT: pminsb %xmm1, %xmm3 +; SSE41-NEXT: pmaxsb %xmm0, %xmm1 +; SSE41-NEXT: psubb %xmm3, %xmm1 +; SSE41-NEXT: psrlw $1, %xmm1 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 +; SSE41-NEXT: pand %xmm3, %xmm5 +; SSE41-NEXT: pandn %xmm2, %xmm3 +; SSE41-NEXT: pmaddubsw %xmm3, %xmm1 +; SSE41-NEXT: psllw $8, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: vec128_i8_signed_reg_mem: @@ -3089,16 +3076,14 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3132,13 +3117,12 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, 
%xmm1, %xmm4 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -3152,13 +3136,12 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; @@ -3293,26 +3276,26 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; SSE41-LABEL: vec128_i8_signed_mem_mem: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: movdqa (%rsi), %xmm3 +; SSE41-NEXT: movdqa (%rsi), %xmm0 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: pcmpgtb %xmm3, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm0, %xmm2 ; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pminsb %xmm3, %xmm0 -; SSE41-NEXT: pmaxsb %xmm1, %xmm3 -; SSE41-NEXT: psubb %xmm0, %xmm3 -; SSE41-NEXT: psrlw $1, %xmm3 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm3 +; SSE41-NEXT: pminsb %xmm0, %xmm3 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: psubb %xmm3, %xmm0 +; SSE41-NEXT: psrlw $1, %xmm0 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: pmullw %xmm4, %xmm0 -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 +; SSE41-NEXT: pand %xmm3, %xmm5 +; 
SSE41-NEXT: pandn %xmm2, %xmm3 +; SSE41-NEXT: pmaddubsw %xmm3, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: por %xmm5, %xmm0 ; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -3327,16 +3310,14 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; @@ -3372,13 +3353,12 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-FALLBACK-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOP-FALLBACK-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; XOP-FALLBACK-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: retq ; @@ -3393,13 +3373,12 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOPAVX1-NEXT: vpmullw 
%xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOPAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14] +; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4 +; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4 +; XOPAVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14] ; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 92060aec30748..7d882b772a64d 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1911,28 +1911,24 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 +; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpand 
%xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1947,16 +1943,14 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1975,23 +1969,20 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 ; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpor %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm7, %xmm6, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; 
XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 +; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] +; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 +; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 +; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2007,16 +1998,14 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -2096,28 +2085,24 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw 
{{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm8 +; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 +; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2134,16 +2119,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 +; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4 +; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2162,23 +2145,20 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 ; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpor %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm7, %xmm6, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 +; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] +; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 +; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 +; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2195,16 +2175,14 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 +; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 +; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: 
vpsllw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -2285,28 +2263,24 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0 -; AVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 +; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -2322,56 +2296,51 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; 
AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-LABEL: vec256_i8_signed_mem_reg: ; XOP: # %bb.0: -; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 -; XOP-NEXT: vmovdqa (%rdi), %xmm1 +; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOP-NEXT: vmovdqa (%rdi), %xmm2 ; XOP-NEXT: vmovdqa 16(%rdi), %xmm3 -; XOP-NEXT: vpcomgtb %xmm2, %xmm3, %xmm4 -; XOP-NEXT: vpcomgtb %xmm0, %xmm1, %xmm5 -; XOP-NEXT: vpminsb %xmm0, %xmm1, %xmm6 -; XOP-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vpcomgtb %xmm1, %xmm3, %xmm4 +; XOP-NEXT: vpcomgtb %xmm0, %xmm2, %xmm5 +; XOP-NEXT: vpminsb %xmm0, %xmm2, %xmm6 +; XOP-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0 ; XOP-NEXT: vpsubb %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpminsb %xmm2, %xmm3, %xmm6 -; XOP-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 -; XOP-NEXT: vpsubb %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpminsb %xmm1, %xmm3, %xmm6 +; XOP-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1 +; XOP-NEXT: vpsubb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 -; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 +; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0 -; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpor %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm7, %xmm6, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vpor %xmm6, 
%xmm5, %xmm5 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 +; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] +; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 +; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 +; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec256_i8_signed_mem_reg: @@ -2384,16 +2353,14 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -2474,28 +2441,24 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 -; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: 
vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 +; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3 +; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2511,16 +2474,14 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2540,23 +2501,20 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 ; XOP-NEXT: vpshlb %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1 -; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpor %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm7, %xmm6, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 +; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] +; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1 +; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 +; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -2573,16 +2531,14 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: 
vpsllw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; @@ -2647,48 +2603,44 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; AVX1-LABEL: vec256_i8_signed_mem_mem: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rsi), %xmm2 -; AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX1-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm5 -; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5 +; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm6 -; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpor %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 -; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 -; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 +; 
AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm8 +; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8 +; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 +; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm5 +; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: vec256_i8_signed_mem_mem: @@ -2702,16 +2654,14 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 +; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4 +; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2732,23 +2682,20 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; XOP-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 ; XOP-NEXT: vpshlb %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpshlb %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; XOP-NEXT: vpor %xmm7, %xmm5, %xmm5 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0 -; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm0, %xmm0 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpor 
%xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; XOP-NEXT: vpmullw %xmm7, %xmm6, %xmm6 -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; XOP-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; XOP-NEXT: vpor %xmm6, %xmm5, %xmm5 +; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8 +; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8 +; XOP-NEXT: vpand %xmm5, %xmm7, %xmm5 +; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0 +; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30] +; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0 +; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4 +; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6 +; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6 +; XOP-NEXT: vpand %xmm4, %xmm7, %xmm4 +; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0 @@ -2766,16 +2713,14 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index ad08ecb35fdef..1413a19b34935 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -889,21 +889,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 +; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 +; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 +; CHECK-SKX-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3 +; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 +; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 +; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 +; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 +; CHECK-SKX-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2 +; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-SKX-VBMI-NEXT: vzeroupper @@ -915,25 +912,19 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4 -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpand 
%ymm5, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm4, %ymm3 -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm3, %ymm3 -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpand %ymm5, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512-NEXT: vpand %ymm3, %ymm4, %ymm5 +; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 +; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 +; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1 +; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm5, %ymm1 +; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3 +; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 +; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 +; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0 +; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm3, %ymm0 ; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-AVX512-NEXT: vzeroupper @@ -945,21 +936,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] -; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 +; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5 +; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5 +; CHECK-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3 +; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 +; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62] +; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1 +; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5 +; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 +; CHECK-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2 +; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 +; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0 ; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-VBMI-NEXT: vzeroupper @@ -976,14 +964,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-SKX-VBMI: # %bb.0: ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1 -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; CHECK-SKX-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; CHECK-SKX-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] -; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 +; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 +; CHECK-SKX-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] +; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx) ; CHECK-SKX-VBMI-NEXT: vzeroupper ; CHECK-SKX-VBMI-NEXT: retq @@ -992,16 +979,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = 
zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; CHECK-AVX512-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX512-NEXT: vpandq %zmm1, %zmm2, %zmm3 +; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 +; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1 +; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vpternlogq $248, %zmm2, %zmm3, %zmm0 ; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq @@ -1010,14 +994,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-VBMI: # %bb.0: ; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 ; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1 -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; CHECK-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] -; CHECK-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3 +; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 +; CHECK-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1 +; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126] +; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1 ; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx) ; CHECK-VBMI-NEXT: vzeroupper ; CHECK-VBMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 26272b62e5fb8..fe791e45eff99 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -160,17 +160,16 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v16i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm2, %xmm4 +; SSE41-NEXT: pandn %xmm1, %xmm2 +; SSE41-NEXT: pmaddubsw %xmm2, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: por %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v16i8: @@ -585,55 +584,50 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v32i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: pmullw %xmm5, %xmm4 -; SSE41-NEXT: pand %xmm6, %xmm4 -; SSE41-NEXT: packuswb %xmm0, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm6, %xmm1 -; SSE41-NEXT: pmullw %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm6, %xmm2 -; SSE41-NEXT: packuswb %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pand %xmm2, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm6 +; SSE41-NEXT: pmaddubsw %xmm5, %xmm6 +; SSE41-NEXT: pand %xmm4, %xmm6 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: pandn %xmm2, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm5, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: por %xmm6, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm2, %xmm5 +; SSE41-NEXT: pand %xmm4, %xmm5 +; SSE41-NEXT: pandn %xmm3, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 +; SSE41-NEXT: psllw $8, %xmm1 +; SSE41-NEXT: por %xmm5, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v32i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm3 +; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3 +; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: mul_v32i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm3 +; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3 +; 
AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v32i8: @@ -906,110 +900,96 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; SSE41-LABEL: mul_v64i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa %xmm1, %xmm8 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm4, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm9, %xmm1 -; SSE41-NEXT: pmullw %xmm10, %xmm0 -; SSE41-NEXT: pand %xmm9, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm5, %xmm8 -; SSE41-NEXT: pand %xmm9, %xmm8 -; SSE41-NEXT: pmullw %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm9, %xmm1 -; SSE41-NEXT: packuswb %xmm8, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm6, %xmm2 -; SSE41-NEXT: pand %xmm9, %xmm2 -; SSE41-NEXT: pmullw %xmm5, %xmm4 -; SSE41-NEXT: pand %xmm9, %xmm4 -; SSE41-NEXT: packuswb %xmm2, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm9, %xmm3 -; SSE41-NEXT: pmullw %xmm2, %xmm5 -; SSE41-NEXT: pand %xmm9, %xmm5 -; SSE41-NEXT: packuswb %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm5, %xmm3 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pand %xmm4, %xmm9 +; SSE41-NEXT: movdqa %xmm0, %xmm10 +; SSE41-NEXT: pmaddubsw %xmm9, %xmm10 +; SSE41-NEXT: pand %xmm8, %xmm10 +; SSE41-NEXT: movdqa %xmm8, %xmm9 +; SSE41-NEXT: pandn %xmm4, %xmm9 +; SSE41-NEXT: pmaddubsw %xmm9, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; 
SSE41-NEXT: por %xmm10, %xmm0 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: movdqa %xmm1, %xmm9 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm9 +; SSE41-NEXT: pand %xmm8, %xmm9 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pandn %xmm5, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm1 +; SSE41-NEXT: psllw $8, %xmm1 +; SSE41-NEXT: por %xmm9, %xmm1 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm6, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pandn %xmm6, %xmm4 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm2 +; SSE41-NEXT: psllw $8, %xmm2 +; SSE41-NEXT: por %xmm5, %xmm2 +; SSE41-NEXT: movdqa %xmm8, %xmm4 +; SSE41-NEXT: pand %xmm7, %xmm4 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: pmaddubsw %xmm4, %xmm5 +; SSE41-NEXT: pand %xmm8, %xmm5 +; SSE41-NEXT: pandn %xmm7, %xmm8 +; SSE41-NEXT: pmaddubsw %xmm8, %xmm3 +; SSE41-NEXT: psllw $8, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm3 ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v64i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm5 +; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5 +; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm2 +; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: mul_v64i8: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 +; AVX512F-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm1 +; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $248, %zmm3, %zmm4, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v64i8: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = 
zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm3 +; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $248, %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: retq entry: %A = mul <64 x i8> %i, %j diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll index c023f129d7565..a950a13b0d8ca 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -58,16 +58,13 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) { define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX256BW-LABEL: test_mul_32i8: ; AVX256BW: # %bb.0: -; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX256BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX256BW-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX256BW-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX256BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX256BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX256BW-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX256BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX256BW-NEXT: vpand %ymm1, %ymm2, %ymm3 +; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 +; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1 +; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 +; AVX256BW-NEXT: vpsllw $8, %ymm0, %ymm0 +; AVX256BW-NEXT: vpternlogq $248, %ymm2, %ymm3, %ymm0 ; AVX256BW-NEXT: retq ; ; AVX512BWVL-LABEL: test_mul_32i8: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index 50be3c5bc6c06..efe34c52b3710 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -62,35 +62,37 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-LABEL: PR50049: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm1 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm5 -; SSE-NEXT: movdqa 32(%rsi), %xmm3 +; SSE-NEXT: movdqa 
32(%rsi), %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u] -; SSE-NEXT: pshufb %xmm6, %xmm0 +; SSE-NEXT: pshufb %xmm6, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u] ; SSE-NEXT: pshufb %xmm7, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; SSE-NEXT: pshufb %xmm3, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,1,4,7,10,13] +; SSE-NEXT: pshufb %xmm8, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufb %xmm6, %xmm5 ; SSE-NEXT: pshufb %xmm7, %xmm4 ; SSE-NEXT: por %xmm5, %xmm4 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE-NEXT: pmullw %xmm5, %xmm0 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm6 = [8,9,10,128,128,128,128,128] -; SSE-NEXT: pshufb %xmm6, %xmm4 -; SSE-NEXT: pmovzxbw {{.*#+}} xmm7 = [128,128,128,1,4,7,10,13] -; SSE-NEXT: pshufb %xmm7, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufb %xmm6, %xmm2 -; SSE-NEXT: pshufb %xmm7, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pmullw %xmm3, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pshufb %xmm3, %xmm4 +; SSE-NEXT: pshufb %xmm8, %xmm1 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pmaddubsw %xmm3, %xmm4 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pmaddubsw %xmm2, %xmm0 +; SSE-NEXT: psllw $8, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: PR50049: @@ -110,23 +112,21 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2 +; 
AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR50049: