Skip to content

Commit

Permalink
[AMDGPU] fixed divergence driven shift operations selection
Browse files Browse the repository at this point in the history
Differential Revision: https://reviews.llvm.org/D73483

Reviewers: rampitec
  • Loading branch information
alex-t committed Jan 31, 2020
1 parent ac8da31 commit 5df1ac7
Show file tree
Hide file tree
Showing 17 changed files with 222 additions and 76 deletions.
12 changes: 6 additions & 6 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Expand Up @@ -541,22 +541,22 @@ let AddedComplexity = 1 in {
let Defs = [SCC] in {
// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
[(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
[(set SReg_32:$sdst, (UniformBinFrag<shl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
[(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
[(set SReg_64:$sdst, (UniformBinFrag<shl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
[(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
[(set SReg_32:$sdst, (UniformBinFrag<srl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
[(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
[(set SReg_64:$sdst, (UniformBinFrag<srl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
[(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
[(set SReg_32:$sdst, (UniformBinFrag<sra> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
[(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
[(set SReg_64:$sdst, (UniformBinFrag<sra> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
} // End Defs = [SCC]

Expand Down
13 changes: 8 additions & 5 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
Expand Up @@ -541,14 +541,17 @@ defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmi
defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
} // End SubtargetPredicate = isGFX6GFX7

let SubtargetPredicate = isGFX6GFX7GFX10 in {
let isCommutable = 1 in {
let SubtargetPredicate = isGFX6GFX7GFX10 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>;
defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>;
defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>;
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX6GFX7GFX10
let SubtargetPredicate = isGFX6GFX7 in {
defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
} // End SubtargetPredicate = isGFX6GFX7
} // End isCommutable = 1


class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
GCNPat<
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/VOP3Instructions.td
Expand Up @@ -385,10 +385,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
}

let SchedRW = [Write64Bit] in {
let SubtargetPredicate = isGFX6GFX7GFX10 in {
let SubtargetPredicate = isGFX6GFX7 in {
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX6GFX7GFX10 in {
def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
} // End SubtargetPredicate = isGFX6GFX7GFX10

Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.mir
Expand Up @@ -237,8 +237,8 @@ body: |
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[V_ASHR_I64_:%[0-9]+]]:vreg_64 = V_ASHR_I64 [[COPY]], [[COPY1]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_ASHR_I64_]]
; GFX10: [[V_ASHRREV_I64_:%[0-9]+]]:vreg_64 = V_ASHRREV_I64 [[COPY1]], [[COPY]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I64_]]
%0:sgpr(s64) = COPY $sgpr0_sgpr1
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s64) = G_ASHR %0, %1
Expand Down Expand Up @@ -277,8 +277,8 @@ body: |
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX10: [[V_ASHR_I64_:%[0-9]+]]:vreg_64 = V_ASHR_I64 [[COPY]], [[COPY1]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_ASHR_I64_]]
; GFX10: [[V_ASHRREV_I64_:%[0-9]+]]:vreg_64 = V_ASHRREV_I64 [[COPY1]], [[COPY]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I64_]]
%0:vgpr(s64) = COPY $vgpr0_vgpr1
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s64) = G_ASHR %0, %1
Expand Down Expand Up @@ -317,8 +317,8 @@ body: |
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX10: [[V_ASHR_I64_:%[0-9]+]]:vreg_64 = V_ASHR_I64 [[COPY]], [[COPY1]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_ASHR_I64_]]
; GFX10: [[V_ASHRREV_I64_:%[0-9]+]]:vreg_64 = V_ASHRREV_I64 [[COPY1]], [[COPY]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I64_]]
%0:vgpr(s64) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_ASHR %0, %1
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.mir
Expand Up @@ -237,8 +237,8 @@ body: |
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[V_LSHR_B64_:%[0-9]+]]:vreg_64 = V_LSHR_B64 [[COPY]], [[COPY1]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHR_B64_]]
; GFX10: [[V_LSHRREV_B64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64 [[COPY1]], [[COPY]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B64_]]
%0:sgpr(s64) = COPY $sgpr0_sgpr1
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s64) = G_LSHR %0, %1
Expand Down Expand Up @@ -277,8 +277,8 @@ body: |
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX10: [[V_LSHR_B64_:%[0-9]+]]:vreg_64 = V_LSHR_B64 [[COPY]], [[COPY1]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHR_B64_]]
; GFX10: [[V_LSHRREV_B64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64 [[COPY1]], [[COPY]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B64_]]
%0:vgpr(s64) = COPY $vgpr0_vgpr1
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s64) = G_LSHR %0, %1
Expand Down Expand Up @@ -317,8 +317,8 @@ body: |
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX10: [[V_LSHR_B64_:%[0-9]+]]:vreg_64 = V_LSHR_B64 [[COPY]], [[COPY1]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHR_B64_]]
; GFX10: [[V_LSHRREV_B64_:%[0-9]+]]:vreg_64 = V_LSHRREV_B64 [[COPY1]], [[COPY]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B64_]]
%0:vgpr(s64) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_LSHR %0, %1
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.mir
Expand Up @@ -237,8 +237,8 @@ body: |
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[V_LSHL_B64_:%[0-9]+]]:vreg_64 = V_LSHL_B64 [[COPY]], [[COPY1]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHL_B64_]]
; GFX10: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 [[COPY1]], [[COPY]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B64_]]
%0:sgpr(s64) = COPY $sgpr0_sgpr1
%1:vgpr(s32) = COPY $vgpr0
%2:vgpr(s64) = G_SHL %0, %1
Expand Down Expand Up @@ -277,8 +277,8 @@ body: |
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX10: [[V_LSHL_B64_:%[0-9]+]]:vreg_64 = V_LSHL_B64 [[COPY]], [[COPY1]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHL_B64_]]
; GFX10: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 [[COPY1]], [[COPY]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B64_]]
%0:vgpr(s64) = COPY $vgpr0_vgpr1
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s64) = G_SHL %0, %1
Expand Down Expand Up @@ -317,8 +317,8 @@ body: |
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX10: [[V_LSHL_B64_:%[0-9]+]]:vreg_64 = V_LSHL_B64 [[COPY]], [[COPY1]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHL_B64_]]
; GFX10: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 [[COPY1]], [[COPY]], implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B64_]]
%0:vgpr(s64) = COPY $vgpr0_vgpr1
%1:vgpr(s32) = COPY $vgpr2
%2:vgpr(s64) = G_SHL %0, %1
Expand Down
26 changes: 12 additions & 14 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
Expand Up @@ -2,9 +2,6 @@
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s

; XFAIL: *
; FIXME: Merge with DAG test

define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) {
; GFX8-LABEL: dpp_test:
; GFX8: ; %bb.0:
Expand All @@ -19,6 +16,7 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
; GFX8-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
Expand All @@ -43,9 +41,10 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s2
Expand All @@ -55,21 +54,20 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dpp64_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT: v_mul_lo_u32 v2, v0, 0
; GFX10-NEXT: v_mul_hi_u32 v3, v0, 8
; GFX10-NEXT: v_mul_lo_u32 v0, v0, 8
; GFX10-NEXT: v_mul_lo_u32 v1, v1, 8
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, s0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, v3
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_mov_b32_e32 v5, s3
; GFX10-NEXT: v_mov_b32_e32 v4, s2
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v6, vcc_lo, v2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v3, v1, vcc_lo
; GFX10-NEXT: global_load_dwordx2 v[2:3], v[6:7], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll
Expand Up @@ -42,8 +42,8 @@ define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x
; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_ashr_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
Expand Down
14 changes: 10 additions & 4 deletions llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
Expand Up @@ -24,8 +24,11 @@ define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]

; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
; GCN-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]

; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]

; GCN: [[BFE]]
; GCN: [[SHL]]
Expand Down Expand Up @@ -97,8 +100,11 @@ define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
; GCN: v_sub_{{[iu]}}32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]

; GCN-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
; GCN-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]

; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]

; GCN: [[BFE]]
; GCN: [[SHL]]
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/commute-shifts.ll
Expand Up @@ -17,7 +17,7 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
; SI-NEXT: v_and_b32_e32 v0, 7, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, v0, v2
; SI-NEXT: v_lshr_b32_e32 v0, v2, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
Expand Up @@ -169,8 +169,8 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: v_lshl_b32_e32 v0, v0, v1
; SI-NEXT: v_lshr_b32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bzhi32_d1_indexzext:
Expand Down
7 changes: 4 additions & 3 deletions llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=PRE-GFX8 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=GFX8 %s

; CHECK-LABEL: {{^}}inline_asm:
; CHECK: s_endpgm
Expand Down Expand Up @@ -241,7 +241,8 @@ entry:
; CHECK: ; def v0
; CHECK: v_mov_b32_e32 v1, v0
; CHECK: ; def v0
; CHECK: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1
; PRE-GFX8: v_lshl_b32_e32 v{{[0-9]+}}, v1, v0
; GFX8: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1
define amdgpu_kernel void @muliple_def_phys_vgpr() {
entry:
%def0 = call i32 asm sideeffect "; def $0 ", "={v0}"()
Expand Down
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -1,5 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s

Expand Down Expand Up @@ -123,8 +125,8 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_and_b32_e32 v3, s8, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; CI-NEXT: v_lshr_b32_e32 v2, v2, v3
; CI-NEXT: v_lshr_b32_e32 v3, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
Expand Down Expand Up @@ -490,10 +492,10 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
; CI-NEXT: v_and_b32_e32 v4, s8, v4
; CI-NEXT: v_and_b32_e32 v3, s8, v3
; CI-NEXT: v_and_b32_e32 v5, s8, v5
; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v3
; CI-NEXT: v_lshrrev_b32_e32 v5, v9, v7
; CI-NEXT: v_lshrrev_b32_e32 v2, v4, v2
; CI-NEXT: v_lshrrev_b32_e32 v4, v8, v6
; CI-NEXT: v_lshr_b32_e32 v3, v3, v5
; CI-NEXT: v_lshr_b32_e32 v5, v7, v9
; CI-NEXT: v_lshr_b32_e32 v2, v2, v4
; CI-NEXT: v_lshr_b32_e32 v4, v6, v8
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v5
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
Expand Up @@ -577,7 +577,7 @@ define amdgpu_kernel void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addr
; GCN: {{buffer|flat|global}}_load_ushort [[VAL0:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[VAL1:v[0-9]+]]

; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
; SI: v_lshl_b32_e32 [[REG:v[0-9]+]], [[VAL0]], [[VAL1]]
; GFX89: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]

; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
Expand Down

0 comments on commit 5df1ac7

Please sign in to comment.