[x86] try harder to form 256-bit unpck*
This is another part of a problem noted in PR42024:
https://bugs.llvm.org/show_bug.cgi?id=42024

The AVX2 code may use awkward 256-bit shuffles, whereas the AVX code gets split
into the expected 128-bit unpack instructions. We have to be selective about the
types where we try to do this, though; otherwise, we can end up with more
instructions (in the case of v8x32/v4x64).

Differential Revision: https://reviews.llvm.org/D72575
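
For the unary v8i32 mask <0, 0, 1, 1, 2, 2, 3, 3> exercised by the tests below, AVX2 previously needed a variable vpermps with a loaded index constant; after this patch it becomes a fixed vpermpd of the four 64-bit chunks followed by a plain unpack. The standalone C++ sketch below (illustrative only, not LLVM code; the helper names are made up) shows why the fixed {0, 2, 1, 3} chunk permute plus AVX2's per-128-bit-lane unpack reproduces both splat2 masks:

#include <array>
#include <cassert>

// Model a 256-bit vector of eight 32-bit elements by the indices of the
// source elements they hold.
using V8 = std::array<int, 8>;

// vpermpd/vpermq-style permute of the four 64-bit chunks (each chunk is a
// pair of 32-bit elements). The new lowering always uses chunk order
// {0, 2, 1, 3}.
static V8 permute64BitChunks(const V8 &V, const std::array<int, 4> &Chunks) {
  V8 R{};
  for (int I = 0; I != 4; ++I) {
    R[2 * I + 0] = V[2 * Chunks[I] + 0];
    R[2 * I + 1] = V[2 * Chunks[I] + 1];
  }
  return R;
}

// vpunpckldq/vpunpckhdq with both sources equal: each 128-bit lane is
// interleaved independently (the "sectored" AVX behavior the new code
// comment refers to).
static V8 unpackPerLane(const V8 &V, bool Lo) {
  V8 R{};
  for (int Lane = 0; Lane != 2; ++Lane) {
    int Base = 4 * Lane + (Lo ? 0 : 2);
    R[4 * Lane + 0] = V[Base + 0];
    R[4 * Lane + 1] = V[Base + 0];
    R[4 * Lane + 2] = V[Base + 1];
    R[4 * Lane + 3] = V[Base + 1];
  }
  return R;
}

int main() {
  V8 Src = {0, 1, 2, 3, 4, 5, 6, 7};
  // Fixed cross-lane pre-shuffle: {0, 1, 4, 5, 2, 3, 6, 7}.
  V8 Pre = permute64BitChunks(Src, {0, 2, 1, 3});
  // The per-lane unpacks now produce the "natural" splat2 interleaves.
  assert((unpackPerLane(Pre, /*Lo=*/true) == V8{0, 0, 1, 1, 2, 2, 3, 3}));
  assert((unpackPerLane(Pre, /*Lo=*/false) == V8{4, 4, 5, 5, 6, 6, 7, 7}));
  return 0;
}

That is the vpermpd/vpermq-plus-vpunpck* sequence visible in the updated splat2_* and shuffle_v8i32_00112233 checks below.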
rotateright committed Jan 17, 2020
1 parent 2d5bfc6 commit 43f60e6
Showing 4 changed files with 93 additions and 27 deletions.
45 changes: 43 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10929,6 +10929,32 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}

/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
/// followed by unpack 256-bit.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
                                        ArrayRef<int> Mask, SDValue V1,
                                        SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 32> Unpckl, Unpckh;
  createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
  createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);

  unsigned UnpackOpcode;
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    UnpackOpcode = X86ISD::UNPCKL;
  else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    UnpackOpcode = X86ISD::UNPCKH;
  else
    return SDValue();

  // This is a "natural" unpack operation (rather than the 128-bit sectored
  // operation implemented by AVX). We need to rearrange 64-bit chunks of the
  // input in order to use the x86 instruction.
  V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
                            DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
  V1 = DAG.getBitcast(VT, V1);
  return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
}

static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
int Delta) {
int Size = (int)Mask.size();
@@ -16210,9 +16236,14 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;

// If the shuffle patterns aren't repeated but it is a single input, directly
// generate a cross-lane VPERMD instruction.
if (V2.isUndef()) {
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;

// If the shuffle patterns aren't repeated but it's a single input, directly
// generate a cross-lane VPERMD instruction.
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
}
@@ -16294,6 +16325,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;

if (V2.isUndef()) {
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;

// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
@@ -16396,6 +16432,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;

if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
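
As a rough summary of the matching step added above, here is a standalone sketch (plain C++ with hypothetical names, not the LLVM API): it builds the lo/hi splat2 masks with the same formula as the new createSplat2ShuffleMask helper shown in the header change below and picks an unpack flavor by exact comparison.

#include <cassert>
#include <vector>

enum class UnpackKind { None, UNPCKL, UNPCKH };

// Build the unary "splat2" mask described in the new header helper:
//   Lo -> <0, 0, 1, 1, ..., N/2-1, N/2-1>
//   Hi -> <N/2, N/2, ..., N-1, N-1>
static std::vector<int> splat2Mask(int NumElts, bool Lo) {
  std::vector<int> Mask;
  for (int I = 0; I != NumElts; ++I)
    Mask.push_back(I / 2 + (Lo ? 0 : NumElts / 2));
  return Mask;
}

// Simplified stand-in for the matching step of lowerShuffleWithUNPCK256.
// The real code compares via isShuffleEquivalent, which also tolerates
// undef (negative) mask elements; exact equality is enough for this sketch.
static UnpackKind matchSplat2Unpack(const std::vector<int> &Mask) {
  int NumElts = static_cast<int>(Mask.size());
  if (Mask == splat2Mask(NumElts, /*Lo=*/true))
    return UnpackKind::UNPCKL;
  if (Mask == splat2Mask(NumElts, /*Lo=*/false))
    return UnpackKind::UNPCKH;
  return UnpackKind::None; // fall back to the variable-permute lowerings
}

int main() {
  assert(matchSplat2Unpack({0, 0, 1, 1, 2, 2, 3, 3}) == UnpackKind::UNPCKL);
  assert(matchSplat2Unpack({4, 4, 5, 5, 6, 6, 7, 7}) == UnpackKind::UNPCKH);
  assert(matchSplat2Unpack({0, 1, 2, 3, 4, 5, 6, 7}) == UnpackKind::None);
  return 0;
}

On a match, the lowering emits the fixed {0, 2, 1, 3} v4f64 shuffle and a single UNPCKL/UNPCKH node, as in the first hunk above; otherwise it returns SDValue() and the existing variable-permute paths take over.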
15 changes: 15 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
@@ -1688,6 +1688,21 @@ namespace llvm {
}
}

  /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
  /// imposed by AVX and specific to the unary pattern. Example:
  /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
  /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
  template <typename T = int>
  void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo) {
    assert(Mask.empty() && "Expected an empty shuffle mask vector");
    int NumElts = VT.getVectorNumElements();
    for (int i = 0; i < NumElts; ++i) {
      int Pos = i / 2;
      Pos += (Lo ? 0 : NumElts / 2);
      Mask.push_back(Pos);
    }
  }

/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can
26 changes: 9 additions & 17 deletions llvm/test/CodeGen/X86/vector-interleave.ll
Expand Up @@ -171,12 +171,9 @@ define void @splat2_i8(<32 x i8>* %s, <64 x i8>* %d) {
;
; AVX2-LABEL: splat2_i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT: vmovdqu %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
@@ -221,12 +218,9 @@ define void @splat2_i16(<16 x i16>* %s, <32 x i16>* %d) {
;
; AVX2-LABEL: splat2_i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7,8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT: vmovdqu %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
@@ -269,11 +263,9 @@ define void @splat2_i32(<8 x i32>* %s, <16 x i32>* %d) {
;
; AVX2-LABEL: splat2_i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [4,4,5,5,6,6,7,7]
; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
; AVX2-NEXT: vmovups %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
34 changes: 26 additions & 8 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1517,11 +1517,29 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00112233:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
; AVX2-SLOW-LABEL: shuffle_v8i32_00112233:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i32_00112233:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_00112233:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_00112233:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x i32> %shuffle
}
@@ -1689,18 +1707,18 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2-SLOW-LABEL: shuffle_v8i32_08991abb:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i32_08991abb:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,1,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-NEXT: retq
